147 lines
4.6 KiB
Python
Executable File
147 lines
4.6 KiB
Python
Executable File
#! /usr/bin/python
|
|
|
|
from __future__ import print_function
|
|
|
|
import re
|
|
import os
|
|
import os.path
|
|
import sys
|
|
import http.client
|
|
import urllib.request
|
|
import time
|
|
|
|
# Regexes (raw strings, so the backslash escapes reach the regex engine
# untouched instead of being read as string escapes).
# url_regex matches a picture URL without its scheme; name_regex captures
# the file name that follows "image/<segment>/" in such a URL.
url_regex = re.compile(r"konachan.com/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")

# Module state.
# counter1/counter2 are the two numbers used in the "Pics <a> - <b>"
# directory names.
counter1, counter2 = 0, 15000
tag_filter = None
# Connection object for konachan.com used by the page download loop.
domain = http.client.HTTPConnection("konachan.com")
|
|
|
|
# little function to calculate the last page of search results
|
|
|
|
|
|
def page_count():
    """Return the page number scraped from the first search result page.

    Requests ``/post?page=1`` on konachan.com for the module-level ``tags``
    string (spaces replaced by ``+``) and inspects the text that follows
    the last ``"Next Page"`` marker in the response.  The first
    ``/post?page=N`` link found there supplies ``N``; if there is no such
    link, 2 is returned.

    A ``BadStatusLine`` answer is retried on a fresh connection after a
    one second pause, for as long as it takes.

    Returns:
        int: the page number found, or 2 as the fallback.
    """
    request_path = "/post?page=1&tags=" + tags.replace(" ", "+")

    while True:
        connection = http.client.HTTPConnection("konachan.com")
        try:
            connection.request("GET", request_path)
            raw_page = connection.getresponse().read()
            break
        except http.client.BadStatusLine:
            # wait a moment, then retry on a new connection
            time.sleep(1)
        finally:
            # always release the socket, on success and on retry alike
            connection.close()

    # Decode the payload rather than parsing the repr of a bytes object.
    page_source = raw_page.decode("utf-8", errors="replace")

    # Only the text after the last "Next Page" marker decides the result
    # (the whole page when the marker is absent).
    tail = page_source.split("Next Page")[-1]
    match = re.search(r"(?<=/post\?page=)\d+", tail)
    if match is None:
        return 2
    return int(match.group(0))
|
|
|
|
# we don't want to save every picture in one directory.
|
|
# so we create a new directory once we have downloaded 15k pics
|
|
|
|
|
|
def directory_size(directory_intern, limit=15000):
    """Switch to a new picture directory once the current one is full.

    When ``directory_intern`` holds ``limit`` or more entries, the
    module-level ``counter1`` and ``counter2`` are advanced by ``limit``,
    the module-level ``directory`` is set to the matching
    ``"Pics <counter1> - <counter2>"`` name, and that directory is created
    unless it already exists.  Otherwise nothing changes.

    The working directory is never changed: the directory names are
    relative to it, and callers keep using them after this call.

    Args:
        directory_intern: path of the directory currently being filled.
        limit: number of entries at which a directory counts as full;
            also the step added to both counters.

    Returns:
        str: the directory to store pictures in after the call -- either
        ``directory_intern`` or the newly selected directory.
    """
    # The counters and the active directory live at module level; without
    # this declaration the augmented assignments raise UnboundLocalError.
    global counter1, counter2, directory

    if len(os.listdir(directory_intern)) < limit:
        return directory_intern

    print("Directory " + directory_intern + " full")
    counter1 += limit
    counter2 += limit
    directory = "Pics " + str(counter1) + " - " + str(counter2)
    if os.path.isdir(directory):
        print("Directory already exists; skip creation")
    else:
        os.makedirs(directory, 0o755, False)
    return directory
|
|
|
|
# now we start
|
|
|
|
# ask the user where the pictures should be stored
print("Please set download location (full path required): ")
path = sys.stdin.readline()

# ask for the tags that narrow down the search
print("Set Tags (seperate multiple tags with a whitespace;"
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# walk into <path>/"Tags: <tags>", creating each level that is missing
for _level in (path.rstrip(), "Tags: " + tags):
    if not os.path.isdir(_level):
        os.makedirs(_level, mode=0o755, exist_ok=True)
    os.chdir(_level)

# the first picture directory is named after the current counter pair
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, mode=0o755, exist_ok=True)
|
|
|
|
# Download loop: fetch every result page, extract the picture links and
# store each picture that is not on disk yet.
for page_number in range(1, page_count()):
    print("Starting download in page " + str(page_number))

    page_path = ("/post?page=" + str(page_number) +
                 "&tags=" + tags.replace(" ", "+"))
    domain.request("GET", page_path)
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # drop the connection, wait a moment, then ask again
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            time.sleep(1)
            domain.request("GET", page_path)

    # Decode the payload instead of parsing the repr of a bytes object.
    index_page_source = index_page.read().decode("utf-8", errors="replace")

    # each "Post.register" chunk is searched for one picture link
    pics_list = index_page_source.split("Post.register")

    directory_size(directory)

    # Sub-directories of the working directory that may already hold
    # pictures.  Plain files are skipped, and the working directory is
    # never changed.
    picture_dirs = [entry for entry in os.listdir() if os.path.isdir(entry)]

    for pic in pics_list:
        # the links are backslash-escaped in the page; strip the
        # backslashes before matching
        pic_url = url_regex.search(pic.replace("\\", ""))
        if not pic_url:
            continue

        name = name_regex.search(pic_url.group(0)).group(1)
        print(" Downloading pic: " + name.replace("%20", " ") +
              " in directory: " + directory)

        # One local file name for both the duplicate check and the
        # download.  The name comes from the remote page, so keep only its
        # last path component to stay inside the picture directories.
        local_name = os.path.basename(urllib.request.url2pathname(name))

        already_present = any(
            os.path.isfile(os.path.join(entry, local_name))
            for entry in picture_dirs
        )
        if already_present:
            print(" Pic is already on your pc! Skip!")
            continue

        urllib.request.urlretrieve("http://" + pic_url.group(0),
                                   os.path.join(directory, local_name))
        print(" Download finished")
|