Reimplement crawler in Go. Working state
@@ -1,165 +0,0 @@
#!/usr/bin/env python3

from __future__ import print_function

import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes
url_regex = re.compile(r"konachan\.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")

# variables
counter1, counter2 = 0, 15000
tag_filter = None
konachan = None
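
# Illustrative link the two regexes cooperate on (path and file name
# here are hypothetical):
#   konachan.com/image/d34db33f/Some%20Picture.png
# url_regex matches the whole link, name_regex then captures
# "Some%20Picture.png" as the file name.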


# little function to calculate the last page of search results
def page_count():
    # open a connection to konachan
    domain = http.client.HTTPConnection(konachan)

    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))

    # we got our response, now it's time to find that number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 0
    for line in page_list:
        if re.search(r"(?<=\/post\?page\=)\d+", line):
            number = re.search(r"(?<=\/post\?page\=)\d+", line).group(0)
        else:
            number = 2
    return int(number)
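
# page_count() leans on the pagination links of the HTML index; the
# markup is assumed to look roughly like
#   <a href="/post?page=42&tags=...">Next Page</a>
# so the lookbehind picks up the page number sitting right in front
# of "Next Page".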


# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    # the counters and the target directory live at module level
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skipping creation")
        else:
            os.makedirs(directory, 0o755, False)
        os.chdir("..")
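
# Directory rotation example: the first 15k pictures land in
# "Pics 0 - 15000"; once that directory is full the counters advance
# and the next batch goes to "Pics 15000 - 30000", and so on.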


# now we start

# the user has to set the path for the pictures
print("Please set the download location (full path required): ")
path = sys.stdin.readline()

# set tags, if the user wants to download specific pictures
print("Set tags (separate multiple tags with a whitespace; " +
      "connect tags consisting of more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# ask whether to use the safe mode or not
print("Do you want to use the safe mode of konachan? [yes/no]")
safemode = sys.stdin.readline().strip("\n")

if safemode == "yes":
    konachan = "konachan.net"
else:
    konachan = "konachan.com"

domain = http.client.HTTPConnection(konachan)

# chdir into $path and create the directory if it does not exist
dir_tags = tags.replace(":", "_")

if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if safemode == "yes":
    if not os.path.isdir("Safemode Tags " + dir_tags):
        os.makedirs("Safemode Tags " + dir_tags, 0o755, True)
    os.chdir("Safemode Tags " + dir_tags)
else:
    if not os.path.isdir("Tags " + dir_tags):
        os.makedirs("Tags " + dir_tags, 0o755, True)
    os.chdir("Tags " + dir_tags)


# creating the directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)


# let's start with downloading
# (page_count() returns the last page, so the range needs the + 1)
for page_number in range(1, page_count() + 1):

    print("Starting download on page " + str(page_number))

    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))

    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)

    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())

    # now we save every link on this page in a list
    pics_list = index_page_source.split("Post.register")

    directory_size(directory)

    # now we can search every chunk for the pic link
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))

        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print("  Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)

            # a little check whether the pic already exists
            existence = False
            for dir in os.listdir():
                os.chdir(dir)
                if os.path.isfile(name.replace("%20", " ")):
                    print("  Pic is already on your pc! Skipping!")
                    existence = True
                os.chdir("..")

            if not existence:
                os.chdir(directory)
                image = urllib.request.URLopener()
                image.retrieve("http://" +
                               pic_url.group(0), urllib.request.url2pathname(name))
                print("  Download finished")
                os.chdir("..")
imagecrawler.go
152 lines, new executable file
@@ -0,0 +1,152 @@
package main

import (
    "encoding/json"
    "flag"
    "fmt"
    "io"
    "io/ioutil"
    "log"
    "net/http"
    "net/url"
    "os"
    "os/user"
    "strings"
    "sync"
)

// Global vars
var wg sync.WaitGroup

// picture holds the only field we need from a konachan post object
type picture struct {
    FileURL string `json:"file_url"`
}
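
// Illustrative API answer this struct decodes (the value is
// hypothetical; all other fields of a post object are ignored):
//   [{"file_url": "https://konachan.com/image/abc123/example.png"}, ...]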

// main function to download pictures
func main() {

    // define flags and parse them
    var path string
    var safemode bool
    var tags string

    flag.StringVar(&path, "dir", "unnamed", "Directory to save pictures. Default is $HOME/pictures/konachan/unnamed")
    flag.BoolVar(&safemode, "safe", false, "Safemode to filter out NSFW pictures. Default is false")
    flag.StringVar(&tags, "tags", "", "Tags used to filter the search query.")

    flag.Parse()
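
    // Example invocation (the binary name is an assumption):
    //   ./imagecrawler -dir wallpapers -safe -tags "landscape,sky"
    // saves safe-rated posts tagged landscape and sky below
    // $HOME/pictures/konachan/wallpapers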

    // set the target directory under the user's home and create it
    homepath, err := user.Current()
    if err != nil {
        log.Fatal(err)
    }
    filepath := strings.Join([]string{homepath.HomeDir, "pictures", "konachan", strings.TrimSuffix(path, "\n")}, "/")
    os.MkdirAll(filepath, 0700)

    // rewrite the tags to meet the API requirements:
    // comma-separated on the command line, "+"-separated in the query
    tags = strings.Replace(tags, ",", "+", -1)
    tags = strings.TrimSuffix(tags, "\n")

    fmt.Println("Starting to crawl :D")

    picHits := 1
    page := 1

    // crawl page by page until a page comes back empty
    for picHits > 0 {

        fmt.Println("Page: ", page)

        website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags)
        if safemode {
            website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags)
        }
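
        // e.g. with -tags "landscape,sky" and -safe, page 3 asks for
        // https://konachan.com/post.json?page=3&tags=landscape+sky+rating:safe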

        picList := openConnection(website)
        pictures := parseMaps(picList)

        picHits = len(pictures)
        page++

        // download all pictures of this page concurrently,
        // then wait for the whole batch before the next page
        wg.Add(len(pictures))
        for _, pic := range pictures {
            go downloadPic(pic, filepath)
        }
        wg.Wait()
    }
}

// function to create the connection to konachan and get the API response
func openConnection(url string) []picture {
    var f []picture

    result, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer result.Body.Close()

    data, err := ioutil.ReadAll(result.Body)
    if err != nil {
        log.Fatal(err)
    }

    if err = json.Unmarshal(data, &f); err != nil {
        panic(err)
    }

    return f
}

// function to parse the json response and extract only the file url
func parseMaps(f []picture) []string {
    fileURLs := []string{}
    for _, pic := range f {
        fileURL := pic.FileURL
        fileURLs = append(fileURLs, fileURL)
    }

    return fileURLs
}

// function to download and save the pictures to disk
func downloadPic(picURL string, filepath string) {
    defer wg.Done()

    // the file name is the last, percent-decoded path segment of the URL
    picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1])
    if err != nil {
        log.Fatal(err)
    }

    // skip pictures that are already on disk
    if _, err := os.Stat(filepath + "/" + picName); err == nil {
        return
    }

    result, err := http.Get(picURL)
    if err != nil {
        log.Fatal(err)
    }
    defer result.Body.Close()

    //fmt.Println(result.Status)

    // on a non-200 answer, retry the download in a fresh goroutine
    if result.StatusCode != 200 {
        wg.Add(1)
        go downloadPic(picURL, filepath)
        return
    }

    file, err := os.Create(filepath + "/" + picName)
    if err != nil {
        log.Fatal(err)
    }

    _, err = io.Copy(file, result.Body)
    if err != nil {
        log.Fatal(err)
    }

    file.Close()

    fmt.Printf("Downloaded: %s\n", picName)
}
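
// File name derivation example (URL hypothetical):
//   picURL  https://konachan.com/image/abc123/Some%20Picture.png
//   picName "Some Picture.png" after url.PathUnescape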