From 5faf3f22f5104d3fd3cc83853f28549afdfbc7fe Mon Sep 17 00:00:00 2001
From: M3philis
Date: Tue, 3 Apr 2018 18:57:46 +0200
Subject: [PATCH] Reimplement crawler in Go; working state

---
 Python/Konachan-Downloader_v3.py | 165 -------------------------------
 imagecrawler.go                  | 152 ++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 165 deletions(-)
 delete mode 100755 Python/Konachan-Downloader_v3.py
 create mode 100755 imagecrawler.go

diff --git a/Python/Konachan-Downloader_v3.py b/Python/Konachan-Downloader_v3.py
deleted file mode 100755
index 4ac99b2..0000000
--- a/Python/Konachan-Downloader_v3.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#! /usr/bin/python
-
-from __future__ import print_function
-
-import re
-import os
-import os.path
-import sys
-import http.client
-import urllib.request
-import time
-
-# regexes
-url_regex = re.compile("konachan.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
-name_regex = re.compile("image/.*?/(.*)")
-
-# variables
-counter1, counter2 = 0, 15000
-tag_filter = None
-konachan = None
-
-
-# little function to calculate the last page of search results
-def page_count():
-    # open connection to konachan.com
-    domain = http.client.HTTPConnection(konachan)
-
-    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
-    while True:
-        try:
-            first_page = domain.getresponse()
-            break
-        except http.client.BadStatusLine:
-            time.sleep(1)
-            domain.close()
-            domain = http.client.HTTPConnection(konachan)
-            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
-
-    # we got our response, now it's time to find that number
-    first_page_source = str(first_page.read())
-    page_list = first_page_source.split("Next Page")
-    number = 0
-    for line in page_list:
-        if re.search("(?<=\/post\?page\=)\d+", line):
-            number = re.search("(?<=\/post\?page\=)\d+", line).group(0)
-        else:
-            number = 2
-    return int(number)
-
-# we don't want to save every picture in one directory,
-# so we create a new directory once we have downloaded 15k pics
-
-
-def directory_size(directory_intern):
-    if len(os.listdir(directory_intern)) >= 15000:
-        print("Directory " + directory_intern + " full")
-        counter1 += 15000
-        counter2 += 15000
-        directory = "Pics " + str(counter1) + " - " + str(counter2)
-        if os.path.isdir(directory):
-            print("Directory already exists; skip creation")
-        else:
-            os.makedirs(directory, 0o755, False)
-            os.chdir("..")
-
-# now we start
-
-
-# the user has to set the path for pictures
-print("Please set download location (full path required): ")
-path = sys.stdin.readline()
-
-# set tags if the user wants to download specific pictures
-print("Set Tags (separate multiple tags with a whitespace; " +
-      "connect tags with more than one word with an underscore): ")
-tags = sys.stdin.readline().strip("\n")
-
-# ask if they want to use the safe mode or not
-print("Do you want to use the safe mode of konachan? [yes/no]")
[yes/no]") -safemode = sys.stdin.readline().strip("\n") - -if safemode == "yes": - konachan = "konachan.net" -else: - konachan = "konachan.com" - -domain = http.client.HTTPConnection(konachan) - -# chdir in $path and create directory if it not exists - -dir_tags = tags.replace(":", "_") - -if not os.path.isdir(path.rstrip()): - os.makedirs(path.rstrip(), 0o755, True) -os.chdir(path.rstrip()) -if safemode == "yes": - if not os.path.isdir("Safemode Tags " + dir_tags): - os.makedirs("Safemode Tags " + dir_tags, 0o755, True) - os.chdir("Safemode Tags " + dir_tags) -else: - if not os.path.isdir("Tags " + dir_tags): - os.makedirs("Tags " + dir_tags, 0o755, True) - os.chdir("Tags " + dir_tags) - - -# creating directory for pics -directory = "Pics " + str(counter1) + " - " + str(counter2) -if not os.path.isdir(directory): - os.makedirs(directory, 0o755, True) - -# let's start with downloading - -for page_number in range(1, page_count()): - - print("Starting download in page " + str(page_number)) - - domain.request("GET", "/post?page=" + str(page_number) + - "&tags=" + tags.replace(" ", "+")) - - while True: - try: - index_page = domain.getresponse() - break - except http.client.BadStatusLine: - domain.close() - domain = http.client.HTTPConnection(konachan) - domain.request("GET", "/post?page=" + str(page_number) + - "&tags=" + tags.replace(" ", "+")) - time.sleep(1) - - # after we got the response from konachan we need the source code - index_page_source = str(index_page.read()) - - # and now we need save every link on this page in a list - pics_list = index_page_source.split("Post.register") - - directory_size(directory) - - # now we can search every line for the pic link - for pic in pics_list: - pic_url = url_regex.search(re.sub("\\\\\\\\", "", pic)) - - # if we found the url we download the pic - # but with whitespaces instead of "%20" - if pic_url: - name = name_regex.search(pic_url.group(0)).group(1) - print(" Downloading pic: " + name.replace("%20", " ") + - " in directory: " + directory) - - # a little check if pic already exists - existance = False - for dir in os.listdir(): - os.chdir(dir) - if os.path.isfile(name.replace("%20", " ")): - print(" Pic is already on your pc! Skip!") - existance = True - os.chdir("..") - - if not existance: - os.chdir(directory) - image = urllib.request.URLopener() - image.retrieve("http://" + - pic_url.group(0), urllib.request.url2pathname(name)) - print(" Download finished") - os.chdir("..") diff --git a/imagecrawler.go b/imagecrawler.go new file mode 100755 index 0000000..5378767 --- /dev/null +++ b/imagecrawler.go @@ -0,0 +1,152 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "net/http" + "net/url" + "os" + "os/user" + "strings" + "sync" +) + +// Global vars +var wg sync.WaitGroup + +type picture struct { + FileURL string `json:"file_url"` +} + +// main function to download pictures +func main() { + + // define flags and parse them + var path string + var safemode bool + var tags string + + flag.StringVar(&path, "dir", "unnamed", "Directory to safe pictures. Default is %HOME/pictures/konachan/unnamed") + flag.BoolVar(&safemode, "safe", false, "Safemode to filter NSFW pictures. 
Default is false") + flag.StringVar(&tags, "tags", "", "Tags used to filter search query.") + + flag.Parse() + + // set home directory and create it to save pictures in + homepath, err := user.Current() + if err != nil { + log.Fatal(err) + } + filepath := strings.Join([]string{homepath.HomeDir, "pictures", "konachan", strings.TrimSuffix(path, "\n")}, "/") + os.MkdirAll(filepath, 0700) + + // edit tags array to met API requirement + tags = strings.Replace(tags, ",", "+", -1) + tags = strings.TrimSuffix(tags, "\n") + + fmt.Println("Starting to crawl :D") + + picHits := 1 + page := 1 + + for picHits > 0 { + + fmt.Println("Page: ", page) + + website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags) + if safemode { + website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags) + } + + picList := openConnection(website) + pictures := parseMaps(picList) + + picHits = len(pictures) + page++ + + wg.Add(len(pictures)) + for _, pic := range pictures { + go downloadPic(pic, filepath) + } + wg.Wait() + } +} + +// function to create the connection to konachan and get the API response +func openConnection(url string) []picture { + var f []picture + + result, err := http.Get(url) + if err != nil { + log.Fatal(err) + } + defer result.Body.Close() + + data, err := ioutil.ReadAll(result.Body) + if err != nil { + log.Fatal(err) + } + + if err = json.Unmarshal(data, &f); err != nil { + panic(err) + } + + return f +} + +// function to parse the json response and extract only the file url +func parseMaps(f []picture) []string { + fileURLs := []string{} + for _, pic := range f { + fileURL := pic.FileURL + fileURLs = append(fileURLs, fileURL) + } + + return fileURLs +} + +// function to download and sace the pictures to disk +func downloadPic(picURL string, filepath string) { + defer wg.Done() + + picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1]) + if err != nil { + log.Fatal(err) + } + + if _, err := os.Stat(filepath + "/" + picName); err == nil { + return + } + + result, err := http.Get(picURL) + if err != nil { + log.Fatal(err) + } + defer result.Body.Close() + + //fmt.Println(result.Status) + + if result.StatusCode != 200 { + wg.Add(1) + go downloadPic(picURL, filepath) + return + } + + file, err := os.Create(filepath + "/" + picName) + if err != nil { + log.Fatal(err) + } + + _, err = io.Copy(file, result.Body) + if err != nil { + log.Fatal(err) + } + + file.Close() + + fmt.Printf("Downloading: %s\n", picName) +}