#!/usr/bin/env python
"""Download the main image and the detail images of one zenb2b.com product.

Usage:
    <script> PRODUCT_CODE

The product's large-image popup page and its detail page are fetched, the
``<img>`` sources found there are resolved to absolute URLs, and each image is
saved into the current directory with ``wget`` as ``<code>_MAIN.<ext>`` or
``<code>_DT_<n>.<ext>``. The total elapsed time is logged at the end.
"""
import logging
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime
from urllib import parse

import bs4
import requests

# Taken at import time so the reported duration covers the whole run.
startTime = datetime.now()

# URL prefixes of the two product pages; the product code is appended.
_PAGE_URLS = {
    'prd': 'http://www.zenb2b.com/shop/goods/goods_popup_large.php?goodsno=',
    'detail': 'http://www.zenb2b.com/shop/goods/goods_view.php?&goodsno=',
}

# Character encoding applied to the fetched pages before parsing.
_PAGE_ENCODING = 'EUC-KR'

# Seconds a single wget download may run before it is terminated.
_DOWNLOAD_TIMEOUT = 80


def url(type, prdCode):
    """Return the page URL of kind *type* for product *prdCode*.

    Args:
        type: ``'prd'`` for the large-image popup page or ``'detail'`` for
            the product detail page. (The name shadows the builtin; it is
            kept so existing keyword callers continue to work.)
        prdCode: Product code appended to the URL.

    Raises:
        ValueError: If *type* is not one of the known page kinds.
    """
    try:
        prefix = _PAGE_URLS[type]
    except KeyError:
        raise ValueError('unknown page type: {!r}'.format(type)) from None
    return prefix + str(prdCode)


def getHtml(url, encoding, timeout=30):
    """Fetch *url* and return it parsed as a ``BeautifulSoup`` document.

    Args:
        url: Address of the page to fetch.
        encoding: Encoding used to decode the response body.
        timeout: Seconds to wait for the server before giving up; without
            it a stalled connection would block the script indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def wget(url, type, prdCode):
    """Build the wget command line that saves *url* to a local file.

    The output file is named ``<prdCode>_<type>.<ext>``, where ``<ext>`` is
    the extension of the last path segment of *url* (query string and
    fragment are ignored). When that segment has no extension the file is
    named ``<prdCode>_<type>``.

    Both the file name and the URL are shell-quoted, so characters such as
    ``&`` or ``;`` in either cannot be interpreted by a shell.

    Returns:
        The command as a single string, e.g.
        ``wget -O 123_MAIN.jpg http://host/a/b.jpg``.
    """
    filename = parse.urlsplit(url).path.rsplit('/', 1)[-1]
    _, dot, extension = filename.rpartition('.')
    outfilename = '{}_{}'.format(prdCode, type)
    if dot and extension:
        outfilename += '.' + extension
    return 'wget -O {} {}'.format(shlex.quote(outfilename), shlex.quote(url))


def _download(cmd, timeout=_DOWNLOAD_TIMEOUT):
    """Run a command line built by ``wget()`` and report whether it succeeded.

    The command is split back into arguments and executed without a shell.
    A download exceeding *timeout* seconds is terminated and reaped. Failures
    are logged, not raised, so one bad image does not stop the others.
    """
    try:
        proc = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE)
    except OSError as err:
        logging.warning('cannot run %r: %s', cmd, err)
        return False

    with proc:
        try:
            proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.terminate()
            # Collect the terminated child so it does not linger as a zombie.
            proc.communicate()
            logging.warning('timed out after %ss: %s', timeout, cmd)
            return False

    if proc.returncode != 0:
        logging.warning('exit status %s: %s', proc.returncode, cmd)
        return False
    return True


def main(argv=None):
    """Download all images of the product named on the command line.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument is used.

    Returns:
        Process exit status: 0 after a completed run, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write('usage: {} PRODUCT_CODE\n'.format(sys.argv[0]))
        return 2
    prdCode = str(args[0])

    # Main image: the single <img id="objImg"> on the popup page.
    pageUrl = url('prd', prdCode)
    page = getHtml(pageUrl, _PAGE_ENCODING)
    mainImgs = page.select('img#objImg')
    src = mainImgs[0].get('src') if mainImgs else None
    if src:
        # urljoin resolves "../x", "/x" and already-absolute sources alike.
        _download(wget(parse.urljoin(pageUrl, src), 'MAIN', prdCode))
    else:
        logging.warning('no main image found for %s', prdCode)

    # Detail images: every <img> inside the description table.
    pageUrl = url('detail', prdCode)
    page = getHtml(pageUrl, _PAGE_ENCODING)
    detailImgs = page.select('div#contents table tr td p img')
    for i, tag in enumerate(detailImgs):
        src = tag.get('src')
        if not src:
            # Keep i tied to the tag's position so file numbering is stable.
            continue
        _download(wget(parse.urljoin(pageUrl, src), 'DT_' + str(i), prdCode))

    consumtime = datetime.now() - startTime
    logging.warning('%s\t%s', prdCode, consumtime)
    return 0


if __name__ == '__main__':
    sys.exit(main())