59 lines
1.7 KiB
Python
59 lines
1.7 KiB
Python
#!/usr/bin/env python
|
|
import requests, bs4, urllib, sys, logging, re, subprocess
|
|
from urllib import parse
|
|
from datetime import datetime
|
|
|
|
# Wall-clock start of the run; subtracted from datetime.now() at the end of
# the script to log how long the whole crawl took.
startTime = datetime.now()
|
|
|
|
def url(type, prdCode):
    """Build the zenb2b.com page URL for a product.

    Args:
        type: Page kind. 'prd' selects the large-image popup page
            (goods_popup_large.php); 'detail' selects the product view
            page (goods_view.php).
        prdCode: Product number, as a string, appended as the ``goodsno``
            query value.

    Returns:
        The absolute page URL as a string.

    Raises:
        ValueError: If ``type`` is neither 'prd' nor 'detail'.
    """
    # NOTE: the parameter name ``type`` shadows the builtin; it is kept so
    # existing callers (positional or keyword) keep working.
    if type == 'prd':
        return 'http://www.zenb2b.com/shop/goods/goods_popup_large.php?goodsno=' + prdCode
    if type == 'detail':
        return 'http://www.zenb2b.com/shop/goods/goods_view.php?&goodsno=' + prdCode
    # Previously an unknown kind fell through to an unbound local and
    # surfaced as an UnboundLocalError; fail with a clear message instead.
    raise ValueError("unknown page type: " + repr(type))
|
|
|
|
def getHtml(url, encoding, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch with an HTTP GET.
        encoding: Character encoding used to decode the response body
            (overrides whatever requests would pick on its own).
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the 'html.parser' backend.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # The encoding must be set before reading .text, which decodes lazily.
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')
|
|
|
|
def wget(url, type, prdCode):
    """Build the shell command that downloads ``url`` with wget.

    The output file is named ``<prdCode>_<type>.<extension>``, where the
    extension is everything after the last '.' in ``url`` (the whole URL if
    it contains no dot).

    Args:
        url: Address of the file to download.
        type: Label inserted into the output file name (e.g. "MAIN").
        prdCode: Product number used as the output file name prefix.

    Returns:
        A command string of the form ``wget -O <outfile> <url>``, intended
        to be run through a shell.
    """
    # Local import keeps this function self-contained.
    import shlex

    extension = url.rsplit(".", 1)[-1]
    outfilename = prdCode + "_" + type + "." + extension
    # The URL comes from scraped HTML, so quote both arguments: characters
    # such as '&', ';' or spaces would otherwise be interpreted by the shell.
    # Strings made only of shell-safe characters are left unchanged.
    return "wget -O " + shlex.quote(outfilename) + " " + shlex.quote(url)
|
|
|
|
|
|
def _run_download(cmd, timeout=80):
    """Run one shell download command, giving it at most ``timeout`` seconds."""
    # The context manager closes the stdout pipe and waits for the child on
    # exit, so a child that had to be terminated is still reaped.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) as proc:
        try:
            proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.terminate()


# The product number is the only command-line argument.
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " <prdCode>")
prdCode = str(sys.argv[1])

# Main product image: taken from the large-image popup page.
targetUrl = url('prd', prdCode)
bs = getHtml(targetUrl, 'EUC-KR')
img = bs.select('img#objImg')[0].get('src')
# The popup page references the image relative to its parent directory.
img = re.sub(r"^\.\.", "http://www.zenb2b.com/shop", img)
_run_download(wget(img, "MAIN", prdCode))

# Detail images: every <img> inside the description table of the view page.
targetUrl = url('detail', prdCode)
bs = getHtml(targetUrl, 'EUC-KR')
imgsrc = bs.select('div#contents table tr td p img')
for i, tag in enumerate(imgsrc):
    img = tag.get('src')
    if not img:
        # An <img> without a src has nothing to download; keep the index so
        # the remaining files are numbered as before.
        continue
    # Site-absolute paths are turned into full URLs.
    img = re.sub(r"^\/", "http://www.zenb2b.com/", img)
    _run_download(wget(img, "DT_" + str(i), prdCode))

# Report the product number and the total elapsed time.
consumtime = datetime.now() - startTime
logging.warning(prdCode + "\t" + str(consumtime))