Compare commits: ee8483f83e ... 693290bc03 (10 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 693290bc03 |  |
|  | 4700488489 |  |
|  | 0bf831ee54 |  |
|  | b2fe0d449a |  |
|  | b48ec14ab2 |  |
|  | d99f04ebeb |  |
|  | ea48bbeb15 |  |
|  | 9b677a46d9 |  |
|  | 5faf3f22f5 |  |
|  | 69ceef6782 |  |
@@ -1,162 +0,0 @@
```python
#! /usr/bin/python

from __future__ import print_function

import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes
url_regex = re.compile(r"konachan.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")

# variables
counter1, counter2 = 0, 15000
tag_filter = None
konachan = None


# little function to calculate the last page of search results
def page_count():
    # open connection to konachan.com
    domain = http.client.HTTPConnection(konachan)

    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))

    # we got our response, now it's time to find that number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 0
    for line in page_list:
        if re.search(r"(?<=\/post\?page\=)\d+", line):
            number = re.search(r"(?<=\/post\?page\=)\d+", line).group(0)
        else:
            number = 2
    return int(number)


# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skipping creation")
        else:
            os.makedirs(directory, 0o755, False)
        os.chdir("..")


# now we start

# the user has to set the path for the pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline()

# set tags, if the user wants to download specific pictures
print("Set tags (separate multiple tags with a whitespace; " +
      "connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# ask whether the safe mode of konachan should be used
print("Do you want to use the safe mode of konachan? [yes/no]")
safemode = sys.stdin.readline().strip("\n")

if safemode == "yes":
    konachan = "konachan.net"
else:
    konachan = "konachan.com"

domain = http.client.HTTPConnection(konachan)

# chdir into $path and create the directory if it does not exist
if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if safemode == "yes":
    if not os.path.isdir("Safemode: Tags: " + tags):
        os.makedirs("Safemode: Tags: " + tags, 0o755, True)
    os.chdir("Safemode: Tags: " + tags)
else:
    if not os.path.isdir("Tags: " + tags):
        os.makedirs("Tags: " + tags, 0o755, True)
    os.chdir("Tags: " + tags)


# creating the directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)

# let's start with the downloading
for page_number in range(1, page_count()):

    print("Starting download on page " + str(page_number))

    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))

    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)

    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())

    # and now we save every link on this page in a list
    pics_list = index_page_source.split("Post.register")

    directory_size(directory)

    # now we can search every line for the pic link
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))

        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print("    Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)

            # a little check whether the pic already exists
            existance = False
            for dir in os.listdir():
                os.chdir(dir)
                if os.path.isfile(name.replace("%20", " ")):
                    print("    Pic is already on your pc! Skip!")
                    existance = True
                os.chdir("..")

            if not existance:
                os.chdir(directory)
                image = urllib.request.URLopener()
                image.retrieve("http://" + pic_url.group(0),
                               urllib.request.url2pathname(name))
                print("    Download finished")
                os.chdir("..")
```
README.md
```diff
@@ -1,2 +1,21 @@
-konachan_crawler
-================
+imagecrawler for konachan.com
+================
+
+## Why this crawler?
+
+This little project was created to easily download pictures from Konachan.
+The web UI of Konachan lets you view single pictures, but if you want to download pictures based on tags, you need to right-click every single one of them to save them to disk.
+This tool just asks for the tags, the path to save to and whether safemode should be used. After that it starts to use the konachan API to download all pictures it finds.
+
+
+## Usage
+
+Just use `go run` to run the tool without compiling.
+`go run imagecrawler.go --dir foo --tags bar,snafu --safe`
+
+
+### Flags
+
+* `--dir`: Defines the directory where to save pictures. Default is `%HOME/pictures/konachan/unnamed`; the `--dir` option defines the `unnamed` part at the end, while `%HOME/pictures/konachan` is hardcoded at the moment.
+* `--tags`: A comma-separated list of tags to search for (`--tags foo,bar,snafu,height:1080`).
+* `--safe`: A boolean to enable safemode (default is off).
```
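For illustration, here are a couple of example invocations assembled from the flags that `imagecrawler.go` below actually defines; the directory and tag values are made up, and `--aspect`/`--site` come from the source rather than from the README above:

```sh
# documented flags: save tagged pictures under %HOME/pictures/konachan/foo
go run imagecrawler.go --dir foo --tags bar,snafu --safe

# flags defined in imagecrawler.go but not yet documented in the README:
# keep only 16:9 pictures and crawl danbooru instead of konachan
go run imagecrawler.go --dir wallpapers --tags height:1080 --aspect 16:9 --site danbooru
```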
imagecrawler.go (new executable file, 216 lines)
@@ -0,0 +1,216 @@
```go
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/user"
	"strconv"
	"strings"
	"sync"
	"time"
)

// Global vars
var wg sync.WaitGroup

type picture struct {
	FileURL     string `json:"file_url"`
	Width       int    `json:"width"`
	Height      int    `json:"height"`
	ImageWidth  int    `json:"image_width"`
	ImageHeight int    `json:"image_height"`
	FileExt     string `json:"file_ext"`
	Tags        string `json:"tag_string"`
}

// main function to download pictures
func main() {

	// define flags and parse them
	var path string
	var safemode bool
	var tags string
	var aspect string
	var site string

	// variables for downloading
	picHits := 1
	page := 1
	ratio := 0.0

	flag.StringVar(&path, "dir", "unnamed", "Directory to save pictures. Default is %HOME/pictures/konachan/unnamed")
	flag.BoolVar(&safemode, "safe", false, "Safemode to filter NSFW pictures. Default is false")
	flag.StringVar(&tags, "tags", "", "Tags used to filter the search query.")
	flag.StringVar(&aspect, "aspect", "", "Aspect ratio pics should have")
	flag.StringVar(&site, "site", "konachan", "Site to crawl from, either konachan or danbooru")

	flag.Parse()

	// set the home directory and create it to save pictures in
	homepath, err := user.Current()
	if err != nil {
		log.Fatal(err)
	}
	filepath := strings.Join([]string{homepath.HomeDir, "pictures", site, strings.TrimSuffix(path, "\n")}, "/")
	os.MkdirAll(filepath, 0700)

	// edit the tags to meet the API requirements
	tags = strings.Replace(tags, ",", "+", -1)
	tags = strings.Replace(tags, "=", ":", -1)
	tags = strings.TrimSuffix(tags, "\n")

	// calculate the aspect ratio
	if isFlagPassed("aspect") {
		aspectSlice := strings.Split(aspect, ":")
		widthF, _ := strconv.ParseFloat(aspectSlice[0], 64)
		heightF, _ := strconv.ParseFloat(aspectSlice[1], 64)
		ratio = widthF / heightF
	} else {
		ratio = 0.0
	}

	for picHits > 0 {

		fmt.Println("Page: ", page)

		website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags)
		if safemode {
			website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags)
		}

		if site == "danbooru" {
			website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s", page, tags)
			if safemode {
				website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s+rating:safe", page, tags)
			}
		}

		picList := openConnection(website)
		pictures, count := parseMaps(picList, ratio)

		picHits = count
		page++

		wg.Add(len(pictures))
		for _, pic := range pictures {
			go downloadPic(pic, filepath)
		}
		wg.Wait()
		time.Sleep(1 * time.Second)
	}
}

func isFlagPassed(name string) bool {
	found := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == name {
			found = true
		}
	})
	return found
}

// function to create the connection to konachan and get the API response
func openConnection(url string) []picture {
	var f []picture

	result, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer result.Body.Close()

	data, err := io.ReadAll(result.Body)
	if err != nil {
		log.Fatal(err)
	}

	if err = json.Unmarshal(data, &f); err != nil {
		panic(err)
	}

	return f
}

// function to parse the json response and extract only the file url
func parseMaps(f []picture, ratio float64) ([]string, int) {
	fileURLs := []string{}
	picCount := 0

	if isFlagPassed("aspect") {
		for _, pic := range f {
			picCount++

			picWidthF := 1.0
			picHeightF := 1.0

			if pic.Width != 0 && pic.Height != 0 {
				picWidthF = float64(pic.Width)
				picHeightF = float64(pic.Height)
			} else {
				picWidthF = float64(pic.ImageWidth)
				picHeightF = float64(pic.ImageHeight)
			}

			if (picWidthF / picHeightF) == ratio {
				fileURL := pic.FileURL
				fileURLs = append(fileURLs, fileURL)
			}
		}
	} else {
		for _, pic := range f {
			picCount++
			fileURL := pic.FileURL
			fileURLs = append(fileURLs, fileURL)
		}
	}
	return fileURLs, picCount
}

// function to download and save the pictures to disk
func downloadPic(picURL string, filepath string) {
	defer wg.Done()

	picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1])
	if err != nil {
		log.Fatal(err)
	}

	// skip the download if the picture is already on disk
	if _, err := os.Stat(filepath + "/" + picName); err == nil {
		return
	}

	result, err := http.Get(picURL)
	if err != nil {
		log.Fatal(err)
	}
	defer result.Body.Close()

	//fmt.Println(result.Status)

	// retry the download if the server did not answer with 200
	if result.StatusCode != 200 {
		wg.Add(1)
		go downloadPic(picURL, filepath)
		return
	}

	file, err := os.Create(filepath + "/" + picName)
	if err != nil {
		log.Fatal(err)
	}

	_, err = io.Copy(file, result.Body)
	if err != nil {
		log.Fatal(err)
	}

	file.Close()

	fmt.Printf("Downloading: %s\n", picName)
}
```
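To make the JSON handling above concrete, here is a minimal, self-contained sketch of how the `picture` struct decodes a konachan-style `post.json` response and how the aspect-ratio filter in `parseMaps` applies to it; the sample payload is invented for illustration and the struct is reduced to three of its fields:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Reduced version of the picture struct from imagecrawler.go.
type picture struct {
	FileURL string `json:"file_url"`
	Width   int    `json:"width"`
	Height  int    `json:"height"`
}

func main() {
	// Invented sample payload; the real API returns an array of posts like this.
	data := []byte(`[{"file_url": "https://example.com/image/abc123/sample.png", "width": 1920, "height": 1080}]`)

	var posts []picture
	if err := json.Unmarshal(data, &posts); err != nil {
		log.Fatal(err)
	}

	// Mirrors the ratio comparison in parseMaps: keep only 16:9 pictures.
	for _, p := range posts {
		if float64(p.Width)/float64(p.Height) == 16.0/9.0 {
			fmt.Println("would download:", p.FileURL)
		}
	}
}
```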