From 5faf3f22f5104d3fd3cc83853f28549afdfbc7fe Mon Sep 17 00:00:00 2001
From: M3philis
Date: Tue, 3 Apr 2018 18:57:46 +0200
Subject: [PATCH] Reimplement crawler in Go; working state

---
 Python/Konachan-Downloader_v3.py | 165 -------------------------------
 imagecrawler.go                  | 152 ++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 165 deletions(-)
 delete mode 100755 Python/Konachan-Downloader_v3.py
 create mode 100755 imagecrawler.go

diff --git a/Python/Konachan-Downloader_v3.py b/Python/Konachan-Downloader_v3.py
deleted file mode 100755
index 4ac99b2..0000000
--- a/Python/Konachan-Downloader_v3.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#! /usr/bin/python
-
-from __future__ import print_function
-
-import re
-import os
-import os.path
-import sys
-import http.client
-import urllib.request
-import time
-
-# regexes
-url_regex = re.compile("konachan.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
-name_regex = re.compile("image/.*?/(.*)")
-
-# variables
-counter1, counter2 = 0, 15000
-tag_filter = None
-konachan = None
-
-
-# little function to calculate the last page of search results
-def page_count():
-    # open connection to konachan.com
-    domain = http.client.HTTPConnection(konachan)
-
-    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
-    while True:
-        try:
-            first_page = domain.getresponse()
-            break
-        except http.client.BadStatusLine:
-            time.sleep(1)
-            domain.close()
-            domain = http.client.HTTPConnection(konachan)
-            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
-
-    # we got our response, now it's time to find that number
-    first_page_source = str(first_page.read())
-    page_list = first_page_source.split("Next Page")
-    number = 0
-    for line in page_list:
-        if re.search("(?<=\/post\?page\=)\d+", line):
-            number = re.search("(?<=\/post\?page\=)\d+", line).group(0)
-        else:
-            number = 2
-    return int(number)
-
-# we don't want to save every picture in one directory,
-# so we create a new directory once we have downloaded 15k pics
-
-
-def directory_size(directory_intern):
-    if len(os.listdir(directory_intern)) >= 15000:
-        print("Directory " + directory_intern + " full")
-        counter1 += 15000
-        counter2 += 15000
-        directory = "Pics " + str(counter1) + " - " + str(counter2)
-        if os.path.isdir(directory):
-            print("Directory already exists; skip creation")
-        else:
-            os.makedirs(directory, 0o755, False)
-            os.chdir("..")
-
-# now we start
-
-
-# the user has to set the path for pictures
-print("Please set download location (full path required): ")
-path = sys.stdin.readline()
-
-# set tags if the user wants to download specific pictures
-print("Set Tags (separate multiple tags with a whitespace; " +
-      "connect tags with more than one word with an underscore): ")
-tags = sys.stdin.readline().strip("\n")
-
-# ask if they want to use the safe mode or not
-print("Do you want to use the safe mode of konachan? [yes/no]")
[yes/no]") -safemode = sys.stdin.readline().strip("\n") - -if safemode == "yes": - konachan = "konachan.net" -else: - konachan = "konachan.com" - -domain = http.client.HTTPConnection(konachan) - -# chdir in $path and create directory if it not exists - -dir_tags = tags.replace(":", "_") - -if not os.path.isdir(path.rstrip()): - os.makedirs(path.rstrip(), 0o755, True) -os.chdir(path.rstrip()) -if safemode == "yes": - if not os.path.isdir("Safemode Tags " + dir_tags): - os.makedirs("Safemode Tags " + dir_tags, 0o755, True) - os.chdir("Safemode Tags " + dir_tags) -else: - if not os.path.isdir("Tags " + dir_tags): - os.makedirs("Tags " + dir_tags, 0o755, True) - os.chdir("Tags " + dir_tags) - - -# creating directory for pics -directory = "Pics " + str(counter1) + " - " + str(counter2) -if not os.path.isdir(directory): - os.makedirs(directory, 0o755, True) - -# let's start with downloading - -for page_number in range(1, page_count()): - - print("Starting download in page " + str(page_number)) - - domain.request("GET", "/post?page=" + str(page_number) + - "&tags=" + tags.replace(" ", "+")) - - while True: - try: - index_page = domain.getresponse() - break - except http.client.BadStatusLine: - domain.close() - domain = http.client.HTTPConnection(konachan) - domain.request("GET", "/post?page=" + str(page_number) + - "&tags=" + tags.replace(" ", "+")) - time.sleep(1) - - # after we got the response from konachan we need the source code - index_page_source = str(index_page.read()) - - # and now we need save every link on this page in a list - pics_list = index_page_source.split("Post.register") - - directory_size(directory) - - # now we can search every line for the pic link - for pic in pics_list: - pic_url = url_regex.search(re.sub("\\\\\\\\", "", pic)) - - # if we found the url we download the pic - # but with whitespaces instead of "%20" - if pic_url: - name = name_regex.search(pic_url.group(0)).group(1) - print(" Downloading pic: " + name.replace("%20", " ") + - " in directory: " + directory) - - # a little check if pic already exists - existance = False - for dir in os.listdir(): - os.chdir(dir) - if os.path.isfile(name.replace("%20", " ")): - print(" Pic is already on your pc! Skip!") - existance = True - os.chdir("..") - - if not existance: - os.chdir(directory) - image = urllib.request.URLopener() - image.retrieve("http://" + - pic_url.group(0), urllib.request.url2pathname(name)) - print(" Download finished") - os.chdir("..") diff --git a/imagecrawler.go b/imagecrawler.go new file mode 100755 index 0000000..5378767 --- /dev/null +++ b/imagecrawler.go @@ -0,0 +1,152 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "net/http" + "net/url" + "os" + "os/user" + "strings" + "sync" +) + +// Global vars +var wg sync.WaitGroup + +type picture struct { + FileURL string `json:"file_url"` +} + +// main function to download pictures +func main() { + + // define flags and parse them + var path string + var safemode bool + var tags string + + flag.StringVar(&path, "dir", "unnamed", "Directory to safe pictures. Default is %HOME/pictures/konachan/unnamed") + flag.BoolVar(&safemode, "safe", false, "Safemode to filter NSFW pictures. 
Default is false") + flag.StringVar(&tags, "tags", "", "Tags used to filter search query.") + + flag.Parse() + + // set home directory and create it to save pictures in + homepath, err := user.Current() + if err != nil { + log.Fatal(err) + } + filepath := strings.Join([]string{homepath.HomeDir, "pictures", "konachan", strings.TrimSuffix(path, "\n")}, "/") + os.MkdirAll(filepath, 0700) + + // edit tags array to met API requirement + tags = strings.Replace(tags, ",", "+", -1) + tags = strings.TrimSuffix(tags, "\n") + + fmt.Println("Starting to crawl :D") + + picHits := 1 + page := 1 + + for picHits > 0 { + + fmt.Println("Page: ", page) + + website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags) + if safemode { + website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags) + } + + picList := openConnection(website) + pictures := parseMaps(picList) + + picHits = len(pictures) + page++ + + wg.Add(len(pictures)) + for _, pic := range pictures { + go downloadPic(pic, filepath) + } + wg.Wait() + } +} + +// function to create the connection to konachan and get the API response +func openConnection(url string) []picture { + var f []picture + + result, err := http.Get(url) + if err != nil { + log.Fatal(err) + } + defer result.Body.Close() + + data, err := ioutil.ReadAll(result.Body) + if err != nil { + log.Fatal(err) + } + + if err = json.Unmarshal(data, &f); err != nil { + panic(err) + } + + return f +} + +// function to parse the json response and extract only the file url +func parseMaps(f []picture) []string { + fileURLs := []string{} + for _, pic := range f { + fileURL := pic.FileURL + fileURLs = append(fileURLs, fileURL) + } + + return fileURLs +} + +// function to download and sace the pictures to disk +func downloadPic(picURL string, filepath string) { + defer wg.Done() + + picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1]) + if err != nil { + log.Fatal(err) + } + + if _, err := os.Stat(filepath + "/" + picName); err == nil { + return + } + + result, err := http.Get(picURL) + if err != nil { + log.Fatal(err) + } + defer result.Body.Close() + + //fmt.Println(result.Status) + + if result.StatusCode != 200 { + wg.Add(1) + go downloadPic(picURL, filepath) + return + } + + file, err := os.Create(filepath + "/" + picName) + if err != nil { + log.Fatal(err) + } + + _, err = io.Copy(file, result.Body) + if err != nil { + log.Fatal(err) + } + + file.Close() + + fmt.Printf("Downloading: %s\n", picName) +}