Monday, 15 October 2012

A comic download life


Download all Conan episodes at once






#author: <limkokhole@facebook.com>
import os
import re
import subprocess
from BeautifulSoup import BeautifulSoup
# Resume point: index of the first episode to (re)download.
# If a previous run failed, set this to the index it printed on failure.
last_failed_episode_index = 4 #change it to failed index if last time failed
# Series index page listing every episode of Detective Conan.
url = "http://www.kkkmh.com/manhua/0710/ming-zhen-tan-ke-nan-conan.html"
# Site root, prepended to the relative hrefs scraped from the index page.
host = "http://www.kkkmh.com"
def download(folder_name, picture_url):
    file_destination = "/".join([folder_name, picture_url.split("/")[-1]])
    command = "curl -vk -w '%{http_code}' -e '"+picture_url+"' -A 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6' '"+picture_url+"' -o '"+file_destination+"'" #don't follow redirect !!!
    print command
    proc = subprocess.Popen(command, shell=True, bufsize=2048, stdout=subprocess.PIPE, close_fds=True)
    http_code = proc.stdout.read()
    try:
        proc.communicate() #wait for complete
    except Exception, e:
        print "proc exception ", e #[Errno 10] No child processe
    if http_code == "302":
        print "302, retrying... :)"
        download(folder_name, picture_url)
def request_url(url):
    command = "curl -vLk -e '"+url+"' -A 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6' '"+url+"'"
    print command
    proc = subprocess.Popen(command, shell=True, bufsize=2048, stdout=subprocess.PIPE, close_fds=True)
    r = proc.stdout
    return r
# --- main script: scrape the episode list, then fetch every picture of each episode ---
r = request_url(url)
soup = BeautifulSoup(r)
# The last <ul> on the index page holds the episode links.
episode_list = soup.html.findAll('ul')[-1]
full_episode_list = []
for episode in episode_list:
    # Collect [title, absolute episode URL] for every entry.
    full_episode_list.append([episode.find("a").get("title"), "".join([host, episode.find("a").get("href")])])
print "full_episode_list is ", full_episode_list
for index, episode in enumerate(full_episode_list):
    try:
        # Skip episodes already completed in a previous run.
        if index >= last_failed_episode_index:
            title = episode[0]
            # Sanitize the title so it is usable as a directory name.
            folder_name = title.replace("/", "[]").replace("'", "\"")
            if len(folder_name) > 255:  # typical filesystem name-length limit
                print "len(folder_name) > 255"
                folder_name = folder_name[:255]
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            r = request_url(episode[1]).read()
            # Picture paths are embedded in the page as hex strings: pic[i] = '...'
            regex = re.compile(";pic\[.*?\][ \n]=[ \n]'(.*?)'", re.DOTALL | re.UNICODE | re.IGNORECASE)
            print r
            hex_code_list = regex.findall(r)
            print "hex_code_list is ", hex_code_list
            # Rebuild the absolute server.js URL by taking the quoted text
            # immediately around the "/common/server.js" occurrence in the page.
            server_list_url = host + "/common/server.js".join([r.split("/common/server.js")[0].split("\"")[-1], r.split("/common/server.js")[1].split("\"")[0]])
            r = request_url(server_list_url).read()
            # server.js names the image host as  url:'...'
            server_url = r.split("url:'")[1].split("'")[0]
            print "server_url is ", server_url
            picture_list = []
            for hex_code in hex_code_list:
                # Hex-decode each path (Python 2 str.decode("hex")) and prefix the image server.
                picture_list.append("".join([server_url, hex_code.decode("hex")]))
            print "picture_list is ", picture_list
            for picture_url in picture_list:
                download(folder_name, picture_url)
    except Exception, e:
        # Abort on the first failure; restart later with
        # last_failed_episode_index set to the index printed here.
        print "failed to download episode ", index
        print "because of ", e
        break