Compare commits: ee8483f83e ... 693290bc03 (10 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 693290bc03 |  |
|  | 4700488489 |  |
|  | 0bf831ee54 |  |
|  | b2fe0d449a |  |
|  | b48ec14ab2 |  |
|  | d99f04ebeb |  |
|  | ea48bbeb15 |  |
|  | 9b677a46d9 |  |
|  | 5faf3f22f5 |  |
|  | 69ceef6782 |  |
@@ -1,162 +0,0 @@
```python
#! /usr/bin/python

from __future__ import print_function

import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes
url_regex = re.compile(r"konachan.(?:com|net)/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")

# variables
counter1, counter2 = 0, 15000
tag_filter = None
konachan = None


# little function to calculate the last page of search results
def page_count():
    # open connection to konachan.com
    domain = http.client.HTTPConnection(konachan)

    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))

    # we got our response, now it's time to find that number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 0
    for line in page_list:
        if re.search(r"(?<=\/post\?page\=)\d+", line):
            number = re.search(r"(?<=\/post\?page\=)\d+", line).group(0)
        else:
            number = 2
    return int(number)


# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skipping creation")
        else:
            os.makedirs(directory, 0o755, False)
        os.chdir("..")


# now we start

# the user has to set the path for the pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline()

# set tags, if the user wants to download specific pictures
print("Set tags (separate multiple tags with a whitespace; " +
      "connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# ask whether the safe mode of konachan should be used
print("Do you want to use the safe mode of konachan? [yes/no]")
safemode = sys.stdin.readline().strip("\n")

if safemode == "yes":
    konachan = "konachan.net"
else:
    konachan = "konachan.com"

domain = http.client.HTTPConnection(konachan)

# chdir into $path and create the directory if it does not exist
if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if safemode == "yes":
    if not os.path.isdir("Safemode: Tags: " + tags):
        os.makedirs("Safemode: Tags: " + tags, 0o755, True)
    os.chdir("Safemode: Tags: " + tags)
else:
    if not os.path.isdir("Tags: " + tags):
        os.makedirs("Tags: " + tags, 0o755, True)
    os.chdir("Tags: " + tags)


# creating the directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)

# let's start with the downloading
for page_number in range(1, page_count()):

    print("Starting download on page " + str(page_number))

    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))

    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection(konachan)
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)

    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())

    # and now we save every link on this page in a list
    pics_list = index_page_source.split("Post.register")

    directory_size(directory)

    # now we can search every line for the pic link
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))

        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print("    Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)

            # a little check whether the pic already exists
            existance = False
            for dir in os.listdir():
                os.chdir(dir)
                if os.path.isfile(name.replace("%20", " ")):
                    print("    Pic is already on your pc! Skip!")
                    existance = True
                os.chdir("..")

            if not existance:
                os.chdir(directory)
                image = urllib.request.URLopener()
                image.retrieve("http://" + pic_url.group(0),
                               urllib.request.url2pathname(name))
                print("    Download finished")
                os.chdir("..")
```
README.md
```diff
@@ -1,2 +1,21 @@
-konachan_crawler
-================
+imagecrawler for konachan.com
+================
+
+## Why this crawler?
+
+This little project was created to easily download pictures from Konachan.
+The web UI of Konachan lets you view single pictures, but if you want to download pictures based on tags, you need to right-click every single one of them to save them to disk.
+This tool just asks for the tags, the path to save to and whether safemode should be used. After that it starts to use the konachan API to download all pictures it finds.
+
+
+## Usage
+
+Just use `go run` to run the tool without compiling.
+`go run imagecrawler.go --dir foo --tags bar,snafu --safe`
+
+
+### Flags
+
+* `--dir`: Defines the directory where to save pictures. Default is `%HOME/pictures/konachan/unnamed`; the `--dir` option defines the `unnamed` part at the end, while `%HOME/pictures/konachan` is hardcoded at the moment.
+* `--tags`: A comma-separated list of tags to search for (`--tags foo,bar,snafu,height:1080`).
+* `--safe`: A boolean to enable safemode (default is off).
```
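For illustration, here are a couple of example invocations assembled from the flags that `imagecrawler.go` below actually defines; the directory and tag values are made up, and `--aspect`/`--site` come from the source rather than from the README above:

```sh
# documented flags: save tagged pictures under %HOME/pictures/konachan/foo
go run imagecrawler.go --dir foo --tags bar,snafu --safe

# flags defined in imagecrawler.go but not yet documented in the README:
# keep only 16:9 pictures and crawl danbooru instead of konachan
go run imagecrawler.go --dir wallpapers --tags height:1080 --aspect 16:9 --site danbooru
```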
imagecrawler.go (new executable file, 216 lines)
@@ -0,0 +1,216 @@
```go
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/user"
	"strconv"
	"strings"
	"sync"
	"time"
)

// Global vars
var wg sync.WaitGroup

type picture struct {
	FileURL     string `json:"file_url"`
	Width       int    `json:"width"`
	Height      int    `json:"height"`
	ImageWidth  int    `json:"image_width"`
	ImageHeight int    `json:"image_height"`
	FileExt     string `json:"file_ext"`
	Tags        string `json:"tag_string"`
}

// main function to download pictures
func main() {

	// define flags and parse them
	var path string
	var safemode bool
	var tags string
	var aspect string
	var site string

	// variables for downloading
	picHits := 1
	page := 1
	ratio := 0.0

	flag.StringVar(&path, "dir", "unnamed", "Directory to save pictures. Default is %HOME/pictures/konachan/unnamed")
	flag.BoolVar(&safemode, "safe", false, "Safemode to filter NSFW pictures. Default is false")
	flag.StringVar(&tags, "tags", "", "Tags used to filter the search query.")
	flag.StringVar(&aspect, "aspect", "", "Aspect ratio pics should have")
	flag.StringVar(&site, "site", "konachan", "Site to crawl from, either konachan or danbooru")

	flag.Parse()

	// set the home directory and create it to save pictures in
	homepath, err := user.Current()
	if err != nil {
		log.Fatal(err)
	}
	filepath := strings.Join([]string{homepath.HomeDir, "pictures", site, strings.TrimSuffix(path, "\n")}, "/")
	os.MkdirAll(filepath, 0700)

	// edit the tags to meet the API requirements
	tags = strings.Replace(tags, ",", "+", -1)
	tags = strings.Replace(tags, "=", ":", -1)
	tags = strings.TrimSuffix(tags, "\n")

	// calculate the aspect ratio
	if isFlagPassed("aspect") {
		aspectSlice := strings.Split(aspect, ":")
		widthF, _ := strconv.ParseFloat(aspectSlice[0], 64)
		heightF, _ := strconv.ParseFloat(aspectSlice[1], 64)
		ratio = widthF / heightF
	} else {
		ratio = 0.0
	}

	for picHits > 0 {

		fmt.Println("Page: ", page)

		website := fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s", page, tags)
		if safemode {
			website = fmt.Sprintf("https://konachan.com/post.json?page=%d&tags=%s+rating:safe", page, tags)
		}

		if site == "danbooru" {
			website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s", page, tags)
			if safemode {
				website = fmt.Sprintf("https://danbooru.donmai.us/posts.json?page=%d&tags=%s+rating:safe", page, tags)
			}
		}

		picList := openConnection(website)
		pictures, count := parseMaps(picList, ratio)

		picHits = count
		page++

		wg.Add(len(pictures))
		for _, pic := range pictures {
			go downloadPic(pic, filepath)
		}
		wg.Wait()
		time.Sleep(1 * time.Second)
	}
}

func isFlagPassed(name string) bool {
	found := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == name {
			found = true
		}
	})
	return found
}

// function to create the connection to konachan and get the API response
func openConnection(url string) []picture {
	var f []picture

	result, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer result.Body.Close()

	data, err := io.ReadAll(result.Body)
	if err != nil {
		log.Fatal(err)
	}

	if err = json.Unmarshal(data, &f); err != nil {
		panic(err)
	}

	return f
}

// function to parse the json response and extract only the file url
func parseMaps(f []picture, ratio float64) ([]string, int) {
	fileURLs := []string{}
	picCount := 0

	if isFlagPassed("aspect") {
		for _, pic := range f {
			picCount++

			picWidthF := 1.0
			picHeightF := 1.0

			if pic.Width != 0 && pic.Height != 0 {
				picWidthF = float64(pic.Width)
				picHeightF = float64(pic.Height)
			} else {
				picWidthF = float64(pic.ImageWidth)
				picHeightF = float64(pic.ImageHeight)
			}

			if (picWidthF / picHeightF) == ratio {
				fileURL := pic.FileURL
				fileURLs = append(fileURLs, fileURL)
			}
		}
	} else {
		for _, pic := range f {
			picCount++
			fileURL := pic.FileURL
			fileURLs = append(fileURLs, fileURL)
		}
	}
	return fileURLs, picCount
}

// function to download and save the pictures to disk
func downloadPic(picURL string, filepath string) {
	defer wg.Done()

	picName, err := url.PathUnescape(strings.Split(picURL, "/")[len(strings.Split(picURL, "/"))-1])
	if err != nil {
		log.Fatal(err)
	}

	// skip the download if the picture is already on disk
	if _, err := os.Stat(filepath + "/" + picName); err == nil {
		return
	}

	result, err := http.Get(picURL)
	if err != nil {
		log.Fatal(err)
	}
	defer result.Body.Close()

	//fmt.Println(result.Status)

	// retry the download if the server did not answer with 200
	if result.StatusCode != 200 {
		wg.Add(1)
		go downloadPic(picURL, filepath)
		return
	}

	file, err := os.Create(filepath + "/" + picName)
	if err != nil {
		log.Fatal(err)
	}

	_, err = io.Copy(file, result.Body)
	if err != nil {
		log.Fatal(err)
	}

	file.Close()

	fmt.Printf("Downloading: %s\n", picName)
}
```
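To make the JSON handling above concrete, here is a minimal, self-contained sketch of how the `picture` struct decodes a konachan-style `post.json` response and how the aspect-ratio filter in `parseMaps` applies to it; the sample payload is invented for illustration and the struct is reduced to three of its fields:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Reduced version of the picture struct from imagecrawler.go.
type picture struct {
	FileURL string `json:"file_url"`
	Width   int    `json:"width"`
	Height  int    `json:"height"`
}

func main() {
	// Invented sample payload; the real API returns an array of posts like this.
	data := []byte(`[{"file_url": "https://example.com/image/abc123/sample.png", "width": 1920, "height": 1080}]`)

	var posts []picture
	if err := json.Unmarshal(data, &posts); err != nil {
		log.Fatal(err)
	}

	// Mirrors the ratio comparison in parseMaps: keep only 16:9 pictures.
	for _, p := range posts {
		if float64(p.Width)/float64(p.Height) == 16.0/9.0 {
			fmt.Println("would download:", p.FileURL)
		}
	}
}
```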