Compare commits


13 Commits

Author SHA1 Message Date
m3philis 9f28c25534 fix debug output 2025-06-01 16:36:53 +02:00
m3philis 27f189f9b9 fix dir permission and remove binary 2025-06-01 16:34:06 +02:00
m3philis 00d25bbe3a change default save location and add go mod 2025-06-01 16:32:48 +02:00
m3philis 693290bc03 replace whitespaces with tabs 2025-04-03 15:23:34 +02:00
m3philis 4700488489 add ration and site parameters 2023-08-21 21:13:36 +02:00
m3philis 0bf831ee54 add line to replace = with : for meta-tags 2023-01-19 02:56:25 +01:00
M3philis b2fe0d449a remove unneccessary comment 2018-06-06 12:21:20 +02:00
M3philis b48ec14ab2 rearranged variable position; changed print to funny message 2018-04-04 10:53:49 +02:00
M3philis d99f04ebeb changed README 2018-04-03 19:29:14 +02:00
M3philis ea48bbeb15 format README 2018-04-03 19:13:15 +02:00
M3philis 9b677a46d9 add README 2018-04-03 19:11:11 +02:00
M3philis 5faf3f22f5 reimplement crawler in golang. Working state 2018-04-03 18:57:46 +02:00
Marco Pfomann 69ceef6782 fixed wrong permissions on directory creation 2016-05-31 16:09:21 +02:00
4 changed files with 258 additions and 163 deletions


@@ -1,162 +0,0 @@
#! /usr/bin/python
from __future__ import print_function
import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes
url_regex = re.compile("konachan.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile("image/.*?/(.*)")

# variables
counter1, counter2 = 0, 15000
tag_filter = None
konachan = None


# little function to calculate the last page of search results
def page_count():
    # open connection to konachan.com
    domain = http.client.HTTPConnection(konachan)
    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    # we got our response, now it's time to find that number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 0
    for line in page_list:
        if re.search("(?<=\/post\?page\=)\d+", line):
            number = re.search("(?<=\/post\?page\=)\d+", line).group(0)
        else:
            number = 2
    return int(number)


# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skip creation")
        else:
            os.makedirs(directory, 0o755, False)
        os.chdir("..")


# now we start
# user has to set the path for the pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline()
# set tags, if the user wants to download specific pictures
print("Set tags (separate multiple tags with a whitespace; " +
      "connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")
# ask if they want to use the safe mode or not
print("Do you want to use the safe mode of konachan? [yes/no]")
safemode = sys.stdin.readline().strip("\n")
if safemode == "yes":
    konachan = "konachan.net"
else:
    konachan = "konachan.com"
domain = http.client.HTTPConnection(konachan)
# chdir into $path and create the directory if it does not exist
if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if safemode == "yes":
    if not os.path.isdir("Safemode: Tags: " + tags):
        os.makedirs("Safemode: Tags: " + tags, 0o755, True)
    os.chdir("Safemode: Tags: " + tags)
else:
    if not os.path.isdir("Tags: " + tags):
        os.makedirs("Tags: " + tags, 0o755, True)
    os.chdir("Tags: " + tags)
# creating the directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)
# let's start with downloading
for page_number in range(1, page_count()):
    print("Starting download on page " + str(page_number))
    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)
    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())
    # and now we save every link on this page in a list
    pics_list = index_page_source.split("Post.register")
    directory_size(directory)
    # now we can search every line for the pic link
    for pic in pics_list:
        pic_url = url_regex.search(re.sub("\\\\\\\\", "", pic))
        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print(" Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)
            # a little check if the pic already exists
            existence = False
            for subdir in os.listdir():
                os.chdir(subdir)
                if os.path.isfile(name.replace("%20", " ")):
                    print(" Pic is already on your pc! Skip!")
                    existence = True
                os.chdir("..")
            if not existence:
                os.chdir(directory)
                image = urllib.request.URLopener()
                image.retrieve("http://" +
                               pic_url.group(0), urllib.request.url2pathname(name))
                print(" Download finished")
                os.chdir("..")


@@ -1,2 +1,21 @@
imagecrawler for konachan.com
================
## Why this crawler?
This little project was created to easily download pictures from Konachan.
The web UI of Konachan lets you view single pictures, but if you want to download pictures based on tags, you have to right-click every single one of them to save it to disk.
This tool only needs to know the tags, the path to save to, and whether safemode should be used. After that it uses the konachan API to download all pictures it finds.
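For reference, each page of results comes from the site's JSON API (the exact URL is built in `imagecrawler.go` below), e.g. `https://konachan.com/post.json?limit=100&page=1&tags=bar+snafu`.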
## Usage
Just use `go run` to run the tool without compiling.
`go run imagecrawler.go --dir foo --tags bar,snafu --safe`
### Flags
* `--dir`: The directory where pictures are saved. If unset, pictures go to `$HOME/pictures/$site/$tags`.
* `--tags`: A comma-separated list of tags to search for (e.g. `--tags foo,bar,snafu,height:1080`).
* `--safe`: A boolean to enable safemode (default is off).
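* `--aspect`: Only keep pictures with the given aspect ratio, written as `width:height` (e.g. `--aspect 16:9`).
* `--site`: The site to crawl, either `konachan` or `danbooru` (default is `konachan`).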

3 go.mod Normal file

@@ -0,0 +1,3 @@
module imagecrawler

go 1.24.3

235 imagecrawler.go Executable file

@@ -0,0 +1,235 @@
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"
)

// Global vars
var wg sync.WaitGroup

type picture struct {
	FileURL     string `json:"file_url"`
	Width       int    `json:"width"`
	Height      int    `json:"height"`
	ImageWidth  int    `json:"image_width"`
	ImageHeight int    `json:"image_height"`
	FileExt     string `json:"file_ext"`
	Tags        string `json:"tag_string"`
}

// main function to download pictures
func main() {
	// define flags and parse them
	var path string
	var safemode bool
	var tags string
	var aspect string
	var site string

	// variables for downloading
	picHits := 1
	page := 1
	ratio := 0.0

	flag.StringVar(&path, "dir", "", "Path to save pictures. Default is '~/pictures/$site/$tags'")
	flag.BoolVar(&safemode, "safe", false, "Safemode to filter NSFW pictures. Default is false")
	flag.StringVar(&tags, "tags", "", "Tags used to filter the search query.")
	flag.StringVar(&aspect, "aspect", "", "Aspect ratio pics should have")
	flag.StringVar(&site, "site", "konachan", "Site to crawl from, either konachan or danbooru")
	flag.Parse()

	// get the user's home directory
	homedir, err := os.UserHomeDir()
	if err != nil {
		log.Fatal(err)
	}

	// edit the tags list to meet the API requirements
	tags = strings.Replace(tags, " ", "_", -1)
	tags = strings.Replace(tags, ",", "+", -1)
	tags = strings.Replace(tags, "=", ":", -1)
	tags = strings.TrimSuffix(tags, "\n")

	// set the path based on the flag or the tags
	defaultPath := strings.Join([]string{homedir, "pictures", site, tags}, "/")
	filepath := defaultPath
	if path != "" {
		filepath = path
	}
	if err := os.MkdirAll(filepath, 0700); err != nil {
		log.Fatal(err)
	}

	// calculate the aspect ratio
	if isFlagPassed("aspect") {
		aspectSlice := strings.Split(aspect, ":")
		widthF, _ := strconv.ParseFloat(aspectSlice[0], 64)
		heightF, _ := strconv.ParseFloat(aspectSlice[1], 64)
		ratio = widthF / heightF
	}

	for picHits > 0 {
		fmt.Println("Page: ", page)
		website := fmt.Sprintf("https://konachan.com/post.json?limit=100&page=%d&tags=%s", page, tags)
		if safemode {
			website = fmt.Sprintf("https://konachan.com/post.json?limit=100&page=%d&tags=%s+rating:safe", page, tags)
		}
		if site == "danbooru" {
			website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s", page, tags)
			if safemode {
				website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s+rating:safe", page, tags)
			}
		}
		response := requestAPI(website)
		// back off briefly when the site throttles us
		if response.StatusCode == 421 {
			time.Sleep(1 * time.Second)
		}
		if response.StatusCode != 200 {
			response.Body.Close()
			continue
		}
		picList := getJSON(response)
		pictures, count := parseMaps(picList, ratio)
		picHits = count
		page++
		wg.Add(len(pictures))
		for _, pic := range pictures {
			go downloadPic(pic, filepath)
		}
		wg.Wait()
		time.Sleep(1 * time.Second)
	}
}

func isFlagPassed(name string) bool {
	found := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == name {
			found = true
		}
	})
	return found
}

// function to create the connection to the site and get the API response
func requestAPI(url string) *http.Response {
	result, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	return result
}

func getJSON(result *http.Response) []picture {
	var f []picture
	defer result.Body.Close()
	data, err := io.ReadAll(result.Body)
	if err != nil {
		log.Fatal(err)
	}
	if err = json.Unmarshal(data, &f); err != nil {
		panic(err)
	}
	return f
}

// function to parse the JSON response and extract only the file URLs
func parseMaps(f []picture, ratio float64) ([]string, int) {
	fileURLs := []string{}
	picCount := 0
	if isFlagPassed("aspect") {
		for _, pic := range f {
			picCount++
			picWidthF := 1.0
			picHeightF := 1.0
			// konachan reports width/height, danbooru image_width/image_height
			if pic.Width != 0 && pic.Height != 0 {
				picWidthF = float64(pic.Width)
				picHeightF = float64(pic.Height)
			} else {
				picWidthF = float64(pic.ImageWidth)
				picHeightF = float64(pic.ImageHeight)
			}
			if (picWidthF / picHeightF) == ratio {
				fileURLs = append(fileURLs, pic.FileURL)
			}
		}
	} else {
		for _, pic := range f {
			picCount++
			fileURLs = append(fileURLs, pic.FileURL)
		}
	}
	return fileURLs, picCount
}

// function to download and save the pictures to disk
func downloadPic(picURL string, filepath string) {
	defer wg.Done()
	parts := strings.Split(picURL, "/")
	picName, err := url.PathUnescape(parts[len(parts)-1])
	if err != nil {
		log.Fatal(err)
	}
	// skip pictures that are already on disk
	if _, err := os.Stat(filepath + "/" + picName); err == nil {
		return
	}
	result, err := http.Get(picURL)
	if err != nil {
		log.Fatal(err)
	}
	defer result.Body.Close()
	// retry the download if the request did not succeed
	if result.StatusCode != 200 {
		wg.Add(1)
		go downloadPic(picURL, filepath)
		return
	}
	file, err := os.Create(filepath + "/" + picName)
	if err != nil {
		log.Fatal(err)
	}
	_, err = io.Copy(file, result.Body)
	if err != nil {
		log.Fatal(err)
	}
	file.Close()
	fmt.Printf("Downloading: %s\n", picName)
}