#!/usr/bin/env python
"""Download the main and detail images for a metaldiy.com product.

Usage: script.py <product-code>

Fetches the product page, extracts the zoom (main) image and every
detail-section image, and saves each one locally via wget.  Logs the
product code and total elapsed time when done.
"""
import requests
import bs4
import urllib
import sys
import logging
import re
import subprocess
from urllib import parse
from datetime import datetime

BASE_URL = 'http://www.metaldiy.com/item/itemView.do?itemId='
DOWNLOAD_TIMEOUT = 80  # seconds allowed per wget invocation


def url(prdCode):
    """Return the product-page URL for product id *prdCode*."""
    return BASE_URL + prdCode


def getHtml(url, encoding):
    """Fetch *url*, force *encoding*, and return a parsed BeautifulSoup tree.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def wget(url, kind, prdCode):
    """Build the wget command saving *url* as <prdCode>_<kind>.<extension>.

    Returns an argument list (not a shell string) so callers can run it
    with shell=False: the URL is scraped from remote HTML, and splicing
    it into a shell string would allow command injection.
    """
    # Take everything after the last dot as the file extension.
    extension = url.rsplit(".", 1)[-1]
    outfilename = prdCode + "_" + kind + "." + extension
    return ["wget", "-O", outfilename, url]


def _download(cmd):
    """Run one wget command list, terminating it if it exceeds the timeout."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        proc.communicate(timeout=DOWNLOAD_TIMEOUT)
    except subprocess.TimeoutExpired:
        proc.terminate()


def main():
    startTime = datetime.now()
    prdCode = str(sys.argv[1])
    bs = getHtml(url(prdCode), 'UTF-8')

    # Main product image (the zoomable one).
    img = bs.select('img#zoom_goods')[0].get('src')
    _download(wget(img, "MAIN", prdCode))

    # Detail-section images, suffixed DT_0, DT_1, ... in page order.
    for i, tag in enumerate(bs.select('div.detail_goodsBox div#detail img')):
        _download(wget(tag.get('src'), "DT_" + str(i), prdCode))

    consumtime = datetime.now() - startTime
    logging.warning(prdCode + "\t" + str(consumtime))


if __name__ == "__main__":
    main()