imagecrawler/Python/Konachan-Downloader_v3.py

#!/usr/bin/python3
# http.client and urllib.request only exist in Python 3
import re
import os
import os.path
import sys
import http.client
import urllib.request
import time
# regexes (raw strings, so the backslashes reach the regex engine untouched)
url_regex = re.compile(r"konachan\.com/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")
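# e.g. these should match a link like
# konachan.com/image/0123abcd/Konachan.com%20-%20123%20tags.jpg
# (hash and file name layout here are only inferred from the patterns)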
# variables
counter1, counter2 = 0, 15000
tag_filter = None
domain = http.client.HTTPConnection("konachan.com")
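# counter1/counter2 are the bounds in the current pics directory name
# ("Pics 0 - 15000"); domain is the persistent connection the main loop
# reuses for every index page it fetches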
# little function to calculate the last page of search results
def page_count():
    # open a connection to konachan.com just for this lookup
    domain = http.client.HTTPConnection("konachan.com")
    domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # broken reply from the server: wait, reconnect and retry
            time.sleep(1)
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=1&tags=" +
                           tags.replace(" ", "+"))
    # we got our response, now it's time to find that number;
    # str() on the raw bytes is good enough for scraping digits
    first_page_source = str(first_page.read())
    # the last page is the highest number in any "/post?page=N" link;
    # fall back to 1 when the search yields no pagination at all
    page_numbers = re.findall(r"(?<=/post\?page=)\d+", first_page_source)
    if page_numbers:
        return max(int(n) for n in page_numbers)
    return 1
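# design note: http.client raises BadStatusLine when it cannot parse the
# status line of a reply, e.g. when the server drops a keep-alive
# connection and the read comes back empty; that is why every
# getresponse() call in this script sits in a reconnect-and-retry loop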
# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics
def directory_size(directory_intern):
    # the counters and the target directory live at module level, so
    # rebinding them here needs an explicit global statement
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skip creation")
        else:
            os.makedirs(directory, 0o755, False)
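# example: once "Pics 0 - 15000" holds 15000 files, the counters move to
# 15000/30000 and all further downloads land in "Pics 15000 - 30000"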
# now we start
# user has to set path for pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline().strip()
# set tags, if the user wants to download specific pictures
print("Set Tags (separate multiple tags with a whitespace;" +
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")
# chdir into $path and create the directories if they do not exist
if not os.path.isdir(path):
    os.makedirs(path, 0o755, True)
os.chdir(path)
if not os.path.isdir("Tags: " + tags):
    os.makedirs("Tags: " + tags, 0o755, True)
os.chdir("Tags: " + tags)
# creating directory for pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)
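# resulting layout, e.g. for tags "landscape sky" under /home/user/pics
# (path and tags here are just an illustration):
#   /home/user/pics/Tags: landscape sky/Pics 0 - 15000/<downloaded pics>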
# let's start with downloading
for page_number in range(1, page_count() + 1):  # + 1: range excludes the stop value
    print("Starting download in page " + str(page_number))
    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # same reconnect-and-retry dance as in page_count()
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)
    # after we got the response from konachan we need the source code;
    # str() keeps the bytes repr, so every literal backslash shows up doubled
    index_page_source = str(index_page.read())
    # the index embeds one Post.register(...) javascript call per post,
    # so splitting on it leaves one chunk per picture
    pics_list = index_page_source.split("Post.register")
    # roll over to a fresh pics directory once the current one is full
    directory_size(directory)
    # now we can search every chunk for the pic link; drop the doubled
    # backslashes from the bytes repr first so the url regex can match
    for pic in pics_list:
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))
        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print(" Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)
            # a little check if the pic already exists in any pics directory
            existence = False
            for subdir in os.listdir():
                os.chdir(subdir)
                if os.path.isfile(name.replace("%20", " ")):
                    print(" Pic is already on your pc! Skip!")
                    existence = True
                os.chdir("..")
            if not existence:
                os.chdir(directory)
                # URLopener is deprecated (and gone in newer Pythons);
                # urllib.request.urlretrieve does the same job
                urllib.request.urlretrieve("http://" + pic_url.group(0),
                                           urllib.request.url2pathname(name))
                print(" Download finished")
                os.chdir("..")
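
# usage sketch (assuming a Unix shell and a python3 interpreter):
#   $ ./Konachan-Downloader_v3.py
#   Please set download location (full path required):
#   /home/user/pics
#   Set Tags (separate multiple tags with a whitespace; connect tags with
#   more than one word with an underscore):
#   landscape sky
# the script then walks every result page of that search and fills the
# "Pics ..." directories created above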