python_apps/crwlers/solarzenfetch.py

#!/usr/bin/env python
import logging
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime
from urllib import parse

import bs4
import requests

# Wall-clock start of the run; subtracted from datetime.now() at the end of
# the script to log how long the whole crawl took.
startTime = datetime.now()

def url(type, prdCode):
	"""Build the zenb2b.com page URL for one product.

	Args:
		type: Page kind. 'prd' selects ``goods_popup_large.php`` and
			'detail' selects ``goods_view.php``.
		prdCode: Product number (string) appended as the ``goodsno``
			query value.

	Returns:
		The page URL as a string.

	Raises:
		ValueError: If ``type`` is neither 'prd' nor 'detail'. Previously
			this case fell through both branches and failed with an
			UnboundLocalError on the return statement.
	"""
	# NOTE: ``type`` shadows the builtin, but it is the public parameter
	# name, so it is kept for callers that pass it by keyword.
	pagePrefixes = {
		'prd': 'http://www.zenb2b.com/shop/goods/goods_popup_large.php?goodsno=',
		'detail': 'http://www.zenb2b.com/shop/goods/goods_view.php?&goodsno=',
	}
	if type not in pagePrefixes:
		raise ValueError("unknown page type: %r (expected 'prd' or 'detail')" % (type,))
	return pagePrefixes[type] + prdCode

def getHtml(url, encoding, timeout=30):
	"""Download a page and parse it into a BeautifulSoup tree.

	Args:
		url: Address to fetch with an HTTP GET.
		encoding: Encoding assigned to the response before its text is
			read, overriding whatever requests would otherwise use.
		timeout: Seconds handed to ``requests.get`` as its ``timeout``.
			Without one, requests waits indefinitely on an unresponsive
			server, which would hang the whole script.

	Returns:
		A ``bs4.BeautifulSoup`` object built with the 'html.parser' parser.

	Raises:
		requests.HTTPError: From ``raise_for_status()`` on a 4xx/5xx reply.
		requests.Timeout: If the server does not answer within ``timeout``.
	"""
	resp = requests.get(url, timeout=timeout)
	resp.raise_for_status()
	# Must be set before reading .text, which decodes using this value.
	resp.encoding = encoding
	return bs4.BeautifulSoup(resp.text, 'html.parser')

def wget(url,type,prdCode) :
	ext = url.split(".")
	index = len(ext) - 1;
	extension = ext[index]
	outfilename = prdCode + "_" + type + "." + extension
	shellcmd = "wget -O " + outfilename + " " + url
	return shellcmd


def _runDownload(cmd, timeout=80):
	"""Run one shell download command, giving up after ``timeout`` seconds.

	Args:
		cmd: Shell command line (as produced by ``wget()``).
		timeout: Maximum number of seconds to wait for the command.
	"""
	# stdout is discarded rather than piped: nothing ever read the pipe, and
	# an unread pipe can stall the child once its buffer fills.
	proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, shell=True)
	try:
		proc.wait(timeout=timeout)
	except subprocess.TimeoutExpired:
		proc.kill()
		# Reap the killed child so it does not linger as a zombie.
		proc.wait()
		logging.warning("download timed out after %ss: %s", timeout, cmd)


# The product number is the single required command-line argument.
if len(sys.argv) < 2:
	sys.exit("usage: " + sys.argv[0] + " <goodsno>")
prdCode = sys.argv[1]

# --- Main product image, taken from the large-image popup page ---
targetUrl = url('prd', prdCode)
bs = getHtml(targetUrl, 'EUC-KR')
mainImgs = bs.select('img#objImg')
mainSrc = mainImgs[0].get('src') if mainImgs else None
if not mainSrc:
	# Exit with a readable message (non-zero status) instead of an
	# IndexError/TypeError traceback when the page has no main image.
	sys.exit("no img#objImg with a src found at " + targetUrl)
# Resolve relative paths ("../...", "/...", "//host/...") against the page URL.
img = parse.urljoin(targetUrl, mainSrc)
cmd = wget(img, "MAIN", prdCode)
_runDownload(cmd)

# --- Detail images embedded in the product description ---
targetUrl = url('detail', prdCode)
bs = getHtml(targetUrl, 'EUC-KR')
imgsrc = bs.select('div#contents table tr td p img')
for i, tag in enumerate(imgsrc):
	src = tag.get('src')
	if not src:
		# <img> without a src: nothing to download. The index is still
		# consumed so file numbering follows the position on the page.
		continue
	img = parse.urljoin(targetUrl, src)
	cmd = wget(img, "DT_" + str(i), prdCode)
	_runDownload(cmd)

# Report the product number and total elapsed time since startTime.
consumtime = datetime.now() - startTime
logging.warning(prdCode + "\t" + str(consumtime))