imagecrawler/Python/Konachan-Downloader_v3.py

#!/usr/bin/python3
# http.client and urllib.request only exist in Python 3
import re
import os
import os.path
import sys
import http.client
import urllib.request
import time
# regexes (raw strings, so the backslashes reach the regex engine untouched)
url_regex = re.compile(r"konachan\.com/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")
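# e.g. these should match a link like
# konachan.com/image/0123abcd/Konachan.com%20-%20123%20tags.jpg
# (hash and file name layout here are only inferred from the patterns)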
# variables
counter1, counter2 = 0, 15000
tag_filter = None
domain = http.client.HTTPConnection("konachan.com")
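# counter1/counter2 are the bounds in the current pics directory name
# ("Pics 0 - 15000"); domain is the persistent connection the main loop
# reuses for every index page it fetches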
# little function to calculate the last page of search results
def page_count():
    # open a connection to konachan.com just for this lookup
    domain = http.client.HTTPConnection("konachan.com")
    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # broken reply from the server: wait, reconnect and retry
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=1&tags=" +
                           tags.replace(" ", "+"))
    # we got our response, now it's time to find that number;
    # str() on the raw bytes is good enough for scraping digits
    first_page_source = str(first_page.read())
    # the last page is the highest number in any "/post?page=N" link;
    # fall back to 1 when the search yields no pagination at all
    page_numbers = re.findall(r"(?<=/post\?page=)\d+", first_page_source)
    if page_numbers:
        return max(int(n) for n in page_numbers)
    return 1
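# design note: http.client raises BadStatusLine when it cannot parse the
# status line of a reply, e.g. when the server drops a keep-alive
# connection and the read comes back empty; that is why every
# getresponse() call in this script sits in a reconnect-and-retry loop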
# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    # the counters and the target directory live at module level, so
    # rebinding them here needs an explicit global statement
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skip creation")
        else:
            os.makedirs(directory, 0o755, False)
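# example: once "Pics 0 - 15000" holds 15000 files, the counters move to
# 15000/30000 and all further downloads land in "Pics 15000 - 30000"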
# now we start
# user has to set path for pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline().strip()
# set tags, if the user wants to download specific pictures
print("Set Tags (separate multiple tags with a whitespace;" +
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")
# chdir into $path and create the directories if they do not exist
if not os.path.isdir(path):
    os.makedirs(path, 0o755, True)
os.chdir(path)
if not os.path.isdir("Tags: " + tags):
    os.makedirs("Tags: " + tags, 0o755, True)
os.chdir("Tags: " + tags)
# creating directory for pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)
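# resulting layout, e.g. for tags "landscape sky" under /home/user/pics
# (path and tags here are just an illustration):
#   /home/user/pics/Tags: landscape sky/Pics 0 - 15000/<downloaded pics>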
# let's start with downloading
for page_number in range(1, page_count() + 1):  # + 1: range excludes the stop value
    print("Starting download in page " + str(page_number))
    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # same reconnect-and-retry dance as in page_count()
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)
    # after we got the response from konachan we need the source code;
    # str() keeps the bytes repr, so every literal backslash shows up doubled
    index_page_source = str(index_page.read())
    # the index embeds one Post.register(...) javascript call per post,
    # so splitting on it leaves one chunk per picture
    pics_list = index_page_source.split("Post.register")
    # roll over to a fresh pics directory once the current one is full
    directory_size(directory)
    # now we can search every chunk for the pic link; drop the doubled
    # backslashes from the bytes repr first so the url regex can match
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))
        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print(" Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)
            # a little check if the pic already exists in any pics directory
            existence = False
            for subdir in os.listdir():
                os.chdir(subdir)
                if os.path.isfile(name.replace("%20", " ")):
                    print(" Pic is already on your pc! Skip!")
                    existence = True
                os.chdir("..")
            if not existence:
                os.chdir(directory)
                # URLopener is deprecated (and gone in newer Pythons);
                # urllib.request.urlretrieve does the same job
                urllib.request.urlretrieve("http://" + pic_url.group(0),
                                           urllib.request.url2pathname(name))
                print(" Download finished")
                os.chdir("..")
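
# usage sketch (assuming a Unix shell and a python3 interpreter):
#   $ ./Konachan-Downloader_v3.py
#   Please set download location (full path required):
#   /home/user/pics
#   Set Tags (separate multiple tags with a whitespace; connect tags with
#   more than one word with an underscore):
#   landscape sky
# the script then walks every result page of that search and fills the
# "Pics ..." directories created above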