From 86625e3031d23b9ac761b2f853bb51fe59639891 Mon Sep 17 00:00:00 2001
From: Marco Pfomann <1pfomann@informatik.uni-hamburg.de>
Date: Wed, 2 Apr 2014 13:23:26 +0200
Subject: [PATCH] previously coded crawler in python

---
 Python/Konachan-Downloader_v3.py | 146 +++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100755 Python/Konachan-Downloader_v3.py

diff --git a/Python/Konachan-Downloader_v3.py b/Python/Konachan-Downloader_v3.py
new file mode 100755
index 0000000..0c1433e
--- /dev/null
+++ b/Python/Konachan-Downloader_v3.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+from __future__ import print_function
+
+import re
+import os
+import os.path
+import sys
+import http.client
+import urllib.request
+import time
+
+# regexes
+url_regex = re.compile(r"konachan\.com/image/.+?/.+?\.(?:png|jpg)")
+name_regex = re.compile(r"image/.*?/(.*)")
+
+# variables
+counter1, counter2 = 0, 15000
+tag_filter = None
+domain = http.client.HTTPConnection("konachan.com")
+
+# little function to calculate the last page of the search results
+
+
+def page_count():
+    # open a connection to konachan.com
+    domain = http.client.HTTPConnection("konachan.com")
+
+    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
+    while True:
+        try:
+            first_page = domain.getresponse()
+            break
+        except http.client.BadStatusLine:
+            time.sleep(1)
+            domain.close()
+            domain = http.client.HTTPConnection("konachan.com")
+            domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
+
+    # we got our response, now it's time to find that number
+    first_page_source = str(first_page.read())
+    page_list = first_page_source.split("Next Page")
+    number = 2
+    for line in page_list:
+        # keep the last page number we find; fall back to 2 if none shows up
+        match = re.search(r"(?<=/post\?page=)\d+", line)
+        if match:
+            number = match.group(0)
+    return int(number)
+
+# we don't want to save every picture in one directory,
+# so we create a new directory once we have downloaded 15k pics
+
+
+def directory_size(directory_intern):
+    global counter1, counter2, directory
+    if len(os.listdir(directory_intern)) >= 15000:
+        print("Directory " + directory_intern + " full")
+        counter1 += 15000
+        counter2 += 15000
+        directory = "Pics " + str(counter1) + " - " + str(counter2)
+        if os.path.isdir(directory):
+            print("Directory already exists; skipping creation")
+        else:
+            os.makedirs(directory, 0o755, False)
+
+# now we start
+
+# the user has to set the download path for the pictures
+print("Please set download location (full path required): ")
+path = sys.stdin.readline()
+
+# set tags if the user wants to download specific pictures
+print("Set Tags (separate multiple tags with a space;" +
+      " connect tags with more than one word with an underscore): ")
+tags = sys.stdin.readline().strip("\n")
+
+# chdir into $path and create the directory if it does not exist
+if not os.path.isdir(path.rstrip()):
+    os.makedirs(path.rstrip(), 0o755, True)
+os.chdir(path.rstrip())
+if not os.path.isdir("Tags: " + tags):
+    os.makedirs("Tags: " + tags, 0o755, True)
+os.chdir("Tags: " + tags)
+
+
+# create the first directory for the pics
+directory = "Pics " + str(counter1) + " - " + str(counter2)
+if not os.path.isdir(directory):
+    os.makedirs(directory, 0o755, True)
+
+# let's start downloading
+
+for page_number in range(1, page_count() + 1):
+
+    print("Starting download on page " + str(page_number))
+
+    domain.request("GET", "/post?page=" + str(page_number) +
+                   "&tags=" + tags.replace(" ", "+"))
+
+    while True:
+        try:
+            index_page = domain.getresponse()
+            break
+        except http.client.BadStatusLine:
+            domain.close()
+            domain = http.client.HTTPConnection("konachan.com")
+            domain.request("GET", "/post?page=" + str(page_number) +
+                           "&tags=" + tags.replace(" ", "+"))
+            time.sleep(1)
+
+    # after we got the response from konachan we need the source code
+    index_page_source = str(index_page.read())
+
+    # and now we save every post block on this page in a list
+    pics_list = index_page_source.split("Post.register")
+
+    directory_size(directory)
+
+    # now we can search every chunk for the pic link
+    for pic in pics_list:
+        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))
+
+        # if we found the url we download the pic,
+        # but with whitespace instead of "%20" in the file name
+        if pic_url:
+            name = name_regex.search(pic_url.group(0)).group(1)
+            print(" Downloading pic: " + name.replace("%20", " ") +
+                  " in directory: " + directory)
+
+            # a little check whether the pic already exists
+            existence = False
+            for subdir in os.listdir():
+                os.chdir(subdir)
+                if os.path.isfile(name.replace("%20", " ")):
+                    print(" Pic is already on your PC! Skipping!")
+                    existence = True
+                os.chdir("..")
+
+            if not existence:
+                os.chdir(directory)
+                # fetch the image and store it under its decoded file name
+                urllib.request.urlretrieve("http://" + pic_url.group(0),
+                                           urllib.request.url2pathname(name))
+                print(" Download finished")
+                os.chdir("..")