Reimplement crawler in Go. Working state
@@ -1,165 +0,0 @@
#!/usr/bin/env python3

from __future__ import print_function

import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes
url_regex = re.compile(r"konachan\.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")

# variables
counter1, counter2 = 0, 15000
tag_filter = None
konachan = None
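
# Illustrative link the two regexes cooperate on (path and file name
# here are hypothetical):
#   konachan.com/image/d34db33f/Some%20Picture.png
# url_regex matches the whole link, name_regex then captures
# "Some%20Picture.png" as the file name.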


# little function to calculate the last page of search results
def page_count():
    # open a connection to konachan
    domain = http.client.HTTPConnection(konachan)

    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))

    # we got our response, now it's time to find that number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 0
    for line in page_list:
        if re.search(r"(?<=\/post\?page\=)\d+", line):
            number = re.search(r"(?<=\/post\?page\=)\d+", line).group(0)
        else:
            number = 2
    return int(number)
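
# page_count() leans on the pagination links of the HTML index; the
# markup is assumed to look roughly like
#   <a href="/post?page=42&tags=...">Next Page</a>
# so the lookbehind picks up the page number sitting right in front
# of "Next Page".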


# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    # the counters and the target directory live at module level
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skipping creation")
        else:
            os.makedirs(directory, 0o755, False)
        os.chdir("..")
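
# Directory rotation example: the first 15k pictures land in
# "Pics 0 - 15000"; once that directory is full the counters advance
# and the next batch goes to "Pics 15000 - 30000", and so on.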


# now we start

# the user has to set the path for the pictures
print("Please set the download location (full path required): ")
path = sys.stdin.readline()

# set tags, if the user wants to download specific pictures
print("Set tags (separate multiple tags with a whitespace; " +
      "connect tags consisting of more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# ask whether to use the safe mode or not
print("Do you want to use the safe mode of konachan? [yes/no]")
safemode = sys.stdin.readline().strip("\n")

if safemode == "yes":
    konachan = "konachan.net"
else:
    konachan = "konachan.com"

domain = http.client.HTTPConnection(konachan)

# chdir into $path and create the directory if it does not exist
dir_tags = tags.replace(":", "_")

if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if safemode == "yes":
    if not os.path.isdir("Safemode Tags " + dir_tags):
        os.makedirs("Safemode Tags " + dir_tags, 0o755, True)
    os.chdir("Safemode Tags " + dir_tags)
else:
    if not os.path.isdir("Tags " + dir_tags):
        os.makedirs("Tags " + dir_tags, 0o755, True)
    os.chdir("Tags " + dir_tags)


# creating the directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)


# let's start with downloading
# (page_count() returns the last page, so the range needs the + 1)
for page_number in range(1, page_count() + 1):

    print("Starting download on page " + str(page_number))

    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))

    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)

    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())

    # now we save every link on this page in a list
    pics_list = index_page_source.split("Post.register")

    directory_size(directory)

    # now we can search every chunk for the pic link
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))

        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print("  Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)

            # a little check whether the pic already exists
            existence = False
            for dir in os.listdir():
                os.chdir(dir)
                if os.path.isfile(name.replace("%20", " ")):
                    print("  Pic is already on your pc! Skipping!")
                    existence = True
                os.chdir("..")

            if not existence:
                os.chdir(directory)
                image = urllib.request.URLopener()
                image.retrieve("http://" +
                               pic_url.group(0), urllib.request.url2pathname(name))
                print("  Download finished")
                os.chdir("..")
imagecrawler.go
152 lines, new executable file
@@ -0,0 +1,152 @@
package main

import (
    "encoding/json"
    "flag"
    "fmt"
    "io"
    "io/ioutil"
    "log"
    "net/http"
    "net/url"
    "os"
    "os/user"
    "strings"
    "sync"
)

// Global vars
var wg sync.WaitGroup

// picture holds the only field we need from a konachan post object
type picture struct {
    FileURL string `json:"file_url"`
}
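
// Illustrative API answer this struct decodes (the value is
// hypothetical; all other fields of a post object are ignored):
//   [{"file_url": "https://konachan.com/image/abc123/example.png"}, ...]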

// main function to download pictures
func main() {

    // define flags and parse them
    var path string
    var safemode bool
    var tags string

    flag.StringVar(&path, "dir", "unnamed", "Directory to save pictures. Default is $HOME/pictures/konachan/unnamed")
    flag.BoolVar(&safemode, "safe", false, "Safemode to filter out NSFW pictures. Default is false")
    flag.StringVar(&tags, "tags", "", "Tags used to filter the search query.")

    flag.Parse()
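
    // Example invocation (the binary name is an assumption):
    //   ./imagecrawler -dir wallpapers -safe -tags "landscape,sky"
    // saves safe-rated posts tagged landscape and sky below
    // $HOME/pictures/konachan/wallpapers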

    // set the target directory under the user's home and create it
    homepath, err := user.Current()
    if err != nil {
        log.Fatal(err)
    }
    filepath := strings.Join([]string{homepath.HomeDir, "pictures", "konachan", strings.TrimSuffix(path, "\n")}, "/")
    os.MkdirAll(filepath, 0700)

    // rewrite the tags to meet the API requirements:
    // comma-separated on the command line, "+"-separated in the query
    tags = strings.Replace(tags, ",", "+", -1)
    tags = strings.TrimSuffix(tags, "\n")

    fmt.Println("Starting to crawl :D")

    picHits := 1
    page := 1

    // crawl page by page until a page comes back empty
    for picHits > 0 {

        fmt.Println("Page: ", page)

        website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags)
        if safemode {
            website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags)
        }
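
        // e.g. with -tags "landscape,sky" and -safe, page 3 asks for
        // https://konachan.com/post.json?page=3&tags=landscape+sky+rating:safe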

        picList := openConnection(website)
        pictures := parseMaps(picList)

        picHits = len(pictures)
        page++

        // download all pictures of this page concurrently,
        // then wait for the whole batch before the next page
        wg.Add(len(pictures))
        for _, pic := range pictures {
            go downloadPic(pic, filepath)
        }
        wg.Wait()
    }
}

// function to create the connection to konachan and get the API response
func openConnection(url string) []picture {
    var f []picture

    result, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer result.Body.Close()

    data, err := ioutil.ReadAll(result.Body)
    if err != nil {
        log.Fatal(err)
    }

    if err = json.Unmarshal(data, &f); err != nil {
        panic(err)
    }

    return f
}

// function to parse the json response and extract only the file url
func parseMaps(f []picture) []string {
    fileURLs := []string{}
    for _, pic := range f {
        fileURL := pic.FileURL
        fileURLs = append(fileURLs, fileURL)
    }

    return fileURLs
}

// function to download and save the pictures to disk
func downloadPic(picURL string, filepath string) {
    defer wg.Done()

    // the file name is the last, percent-decoded path segment of the URL
    picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1])
    if err != nil {
        log.Fatal(err)
    }

    // skip pictures that are already on disk
    if _, err := os.Stat(filepath + "/" + picName); err == nil {
        return
    }

    result, err := http.Get(picURL)
    if err != nil {
        log.Fatal(err)
    }
    defer result.Body.Close()

    //fmt.Println(result.Status)

    // on a non-200 answer, retry the download in a fresh goroutine
    if result.StatusCode != 200 {
        wg.Add(1)
        go downloadPic(picURL, filepath)
        return
    }

    file, err := os.Create(filepath + "/" + picName)
    if err != nil {
        log.Fatal(err)
    }

    _, err = io.Copy(file, result.Body)
    if err != nil {
        log.Fatal(err)
    }

    file.Close()

    fmt.Printf("Downloaded: %s\n", picName)
}
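
// File name derivation example (URL hypothetical):
//   picURL  https://konachan.com/image/abc123/Some%20Picture.png
//   picName "Some Picture.png" after url.PathUnescape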