Download all Conan episodes at once
#author: <limkokhole@facebook.com>
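
# Rough flow: fetch the index page, pull every episode link from its last
# <ul>, then for each episode scrape the hex-encoded picture paths out of the
# inline JavaScript, look up the image server via /common/server.js, and
# download each picture with curl.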
import os
import re
import subprocess
from BeautifulSoup import BeautifulSoup
last_failed_episode_index = 4 # change this to the index of the failed episode to resume a previous run
url = "http://www.kkkmh.com/manhua/0710/ming-zhen-tan-ke-nan-conan.html"
host = "http://www.kkkmh.com"
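
# download one picture; the referer and browser user-agent are spoofed so the
# request looks like it came from a browser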
def download(folder_name, picture_url):
    file_destination = "/".join([folder_name, picture_url.split("/")[-1]])
    # no -L on purpose: don't follow redirects, so a 302 from the image host can be caught and retried below
    command = "curl -vk -w '%{http_code}' -e '"+picture_url+"' -A 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6' '"+picture_url+"' -o '"+file_destination+"'"
    print command
    proc = subprocess.Popen(command, shell=True, bufsize=2048, stdout=subprocess.PIPE, close_fds=True)
    http_code = proc.stdout.read() # -w '%{http_code}' writes the status code to stdout; the body goes to the file via -o
    try:
        proc.communicate() # wait for curl to finish
    except Exception, e:
        print "proc exception ", e # e.g. [Errno 10] No child processes
    if http_code == "302":
        print "302, retrying... :)"
        download(folder_name, picture_url)
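
# fetch a URL with curl and return its stdout as a file-like object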
def request_url(url):
    command = "curl -vLk -e '"+url+"' -A 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6' '"+url+"'"
    print command
    proc = subprocess.Popen(command, shell=True, bufsize=2048, stdout=subprocess.PIPE, close_fds=True)
    r = proc.stdout
    return r
r = request_url(url)
soup = BeautifulSoup(r)
episode_list = soup.html.findAll('ul')[-1] # the last <ul> on the page holds the episode links
full_episode_list = []
for episode in episode_list:
    full_episode_list.append([episode.find("a").get("title"), "".join([host, episode.find("a").get("href")])])
print "full_episode_list is ", full_episode_list
for index, episode in enumerate(full_episode_list):
    try:
        if index >= last_failed_episode_index:
            title = episode[0]
            # strip characters that would break the folder path or the quoted curl command
            folder_name = title.replace("/", "[]").replace("'", "\"")
            if len(folder_name) > 255:
                print "len(folder_name) > 255"
                folder_name = folder_name[:255] # stay under the filename length limit
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            r = request_url(episode[1]).read()
            # the picture paths are stored hex-encoded in the page's inline JS, e.g. ;pic[0] = '...'
            regex = re.compile(";pic\[.*?\][ \n]=[ \n]'(.*?)'", re.DOTALL | re.UNICODE | re.IGNORECASE)
            print r
            hex_code_list = regex.findall(r)
            print "hex_code_list is ", hex_code_list
            # the page quotes a ".../common/server.js" URL; splice it back together and prefix the host
            server_list_url = host + "/common/server.js".join([r.split("/common/server.js")[0].split("\"")[-1], r.split("/common/server.js")[1].split("\"")[0]])
            r = request_url(server_list_url).read()
            server_url = r.split("url:'")[1].split("'")[0] # server.js holds the image-server base as url:'...'
            print "server_url is ", server_url
            picture_list = []
            for hex_code in hex_code_list:
                # decode the hex blob back into a URL path and prepend the image-server base
                picture_list.append("".join([server_url, hex_code.decode("hex")]))
            print "picture_list is ", picture_list
            for picture_url in picture_list:
                download(folder_name, picture_url)
    except Exception, e:
        print "failed to download episode ", index
        print "because of ", e
        break # stop here; set last_failed_episode_index to this index and rerun
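
For reference, each pic[...] value the regex captures is just a URL path hex-encoded in the episode page's inline JavaScript. A quick Python 2 session shows the decoding step (the path below is a made-up example, not one taken from the site):

    >>> "2f6d616e6875612f312e6a7067".decode("hex")
    '/manhua/1.jpg'

Note the script is Python 2 only: the print statements, the "except Exception, e" syntax, and str.decode("hex") were all removed in Python 3. It also shells out to curl, so curl must be on your PATH.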