Previously coded crawler in Python
146
Python/Konachan-Downloader_v3.py
Executable file
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

import re
import os
import os.path
import sys
import http.client
import urllib.request
import time

# regexes (raw strings so the backslashes reach the regex engine intact)
url_regex = re.compile(r"konachan\.com/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")
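
# A quick illustration of what the two regexes extract; the URL below is a
# made-up example, not a real post:
#   sample = "konachan.com/image/0123abcd/sample%20pic.png"
#   url_regex.search(sample).group(0)   # -> the whole sample string
#   name_regex.search(sample).group(1)  # -> "sample%20pic.png"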

# variables
counter1, counter2 = 0, 15000
tag_filter = None
domain = http.client.HTTPConnection("konachan.com")


# little helper that works out the last page of the search results


def page_count():
    # open a fresh connection to konachan.com
    conn = http.client.HTTPConnection("konachan.com")

    conn.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))
    while True:
        try:
            first_page = conn.getresponse()
            break
        except http.client.BadStatusLine:
            # the server occasionally sends a malformed status line;
            # wait a second, reconnect and retry
            time.sleep(1)
            conn.close()
            conn = http.client.HTTPConnection("konachan.com")
            conn.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+"))

    # we got our response, now it's time to find the last page number
    first_page_source = str(first_page.read())
    page_list = first_page_source.split("Next Page")
    number = 1  # fallback: a single page of results
    for line in page_list:
        match = re.search(r"(?<=/post\?page=)\d+", line)
        if match:
            number = match.group(0)
    return int(number)
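
# For illustration, a condensed view of what page_count() digs through; the
# markup below is a hypothetical fragment of the search page, not real output:
#   ...<a href="/post?page=137&tags=foo">137</a>
#   ...<a href="/post?page=2&tags=foo">Next Page</a>
# Keeping the last /post?page=N match yields 137, assuming the pager lists
# the highest page number last, as it does on the konachan.com results pages.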

# we don't want to save every picture in one directory,
# so we create a new directory once we have downloaded 15k pics


def directory_size(directory_intern):
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        # the new directory is created relative to the tag folder we are in
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skipping creation")
        else:
            os.makedirs(directory, 0o755, False)
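
# Rollover sketch (hypothetical numbers): once "Pics 0 - 15000" holds 15000
# files the counters advance, e.g.:
#   counter1, counter2 = 15000, 30000
#   "Pics " + str(counter1) + " - " + str(counter2)  # -> 'Pics 15000 - 30000'
# and all further downloads land in that new directory.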


# now we start

# the user has to set the path for the pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline().rstrip()

# set tags, in case the user wants to download specific pictures
print("Set Tags (separate multiple tags with a whitespace;" +
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")
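
# For example, entering the line below (the tag names are made up for
# illustration) fetches every post tagged both "blue_hair" and "long_hair":
#   blue_hair long_hair
# which the script turns into the query string "tags=blue_hair+long_hair".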

# chdir into $path and create the directories if they do not exist yet
if not os.path.isdir(path):
    os.makedirs(path, 0o755, True)
os.chdir(path)
if not os.path.isdir("Tags: " + tags):
    os.makedirs("Tags: " + tags, 0o755, True)
os.chdir("Tags: " + tags)

# creating the first directory for the pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)

# let's start with the downloading

for page_number in range(1, page_count() + 1):

    print("Starting download on page " + str(page_number))

    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))

    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # same malformed-status-line workaround as in page_count()
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)

    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())

    # now we save every post chunk on this page in a list
    pics_list = index_page_source.split("Post.register")

    # roll over to a fresh directory when the current one is full
    directory_size(directory)

    # now we can search every chunk for the pic link
    for pic in pics_list:
        # str() on the raw bytes doubles the backslashes that escape the
        # slashes in the embedded JSON, so strip them before matching
        pic_url = url_regex.search(re.sub(r"\\\\", "", pic))

        # if we found the url we download the pic,
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print(" Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)

            # a little check whether the pic already sits in one of our dirs
            existence = False
            for entry in os.listdir():
                if os.path.isdir(entry) and os.path.isfile(
                        os.path.join(entry, name.replace("%20", " "))):
                    print(" Pic is already on your pc! Skip!")
                    existence = True
                    break

            if not existence:
                os.chdir(directory)
                # URLopener is deprecated; urlretrieve does the same job
                urllib.request.urlretrieve(
                    "http://" + pic_url.group(0),
                    urllib.request.url2pathname(name))
                print(" Download finished")
                os.chdir("..")
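
A quick usage sketch (the path and tag are made-up examples; the prompts come
straight from the script):

    $ python3 Konachan-Downloader_v3.py
    Please set download location (full path required):
    /home/user/konachan
    Set Tags (separate multiple tags with a whitespace; connect tags with more than one word with an underscore):
    blue_hair
    Starting download on page 1
    ...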