python_apps/crwlers/chulmulfetch.py
2023-11-03 14:49:12 +09:00

52 lines
1.4 KiB
Python

#!/usr/bin/env python
import logging
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime
from urllib import parse

import bs4
import requests
# Wall-clock start of the crawl; elapsed time is logged at the end of the script.
startTime = datetime.now()
def url(prdCode):
    """Build the metaldiy.com item-view URL for product *prdCode*.

    Parameters
    ----------
    prdCode : str
        Product identifier appended as the ``itemId`` query parameter.
    """
    # The original rebound the function's own name to a local variable
    # before returning it; return the expression directly instead.
    return 'http://www.metaldiy.com/item/itemView.do?itemId=' + prdCode
def getHtml(url, encoding, timeout=30):
    """Fetch *url* and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    encoding : str
        Character encoding to decode the response with (e.g. ``'UTF-8'``);
        forced because the server's charset header may be wrong.
    timeout : float, optional
        Seconds to wait for the server. The original call had no timeout,
        so an unresponsive server hung the whole crawl indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within *timeout* seconds.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = encoding  # override detected charset before .text decodes
    return bs4.BeautifulSoup(resp.text, 'html.parser')
def wget(url, type, prdCode):
    """Build a ``wget`` shell command that downloads *url*.

    The output file is named ``<prdCode>_<type>.<ext>`` where *ext* is the
    text after the last dot in the URL (the whole URL if it has no dot).

    Both *url* (scraped from remote HTML) and *prdCode* (from argv) are
    untrusted, and callers run the returned string with ``shell=True`` —
    so both are shell-quoted before being embedded in the command.

    Parameters
    ----------
    url : str
        Image URL to download.
    type : str
        Label embedded in the output file name (e.g. ``"MAIN"``, ``"DT_0"``).
        (Name shadows the builtin, kept for caller compatibility.)
    prdCode : str
        Product code used as the file name prefix.
    """
    extension = url.rsplit(".", 1)[-1]  # last dot-separated component
    outfilename = prdCode + "_" + type + "." + extension
    # shlex.quote is a no-op for already-safe strings, so ordinary URLs
    # yield the same command as before; hostile input is neutralised.
    return "wget -O " + shlex.quote(outfilename) + " " + shlex.quote(url)
def _download(cmd):
    """Run one wget shell command, killing it if it exceeds 80 seconds."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    try:
        proc.communicate(timeout=80)
    except subprocess.TimeoutExpired:
        proc.terminate()

# --- Crawl entry point: fetch the main and detail images of one product ---
prdCode = str(sys.argv[1])  # product code from the command line
targetUrl = url(prdCode)
bs = getHtml(targetUrl, 'UTF-8')

# Main product image (first <img id="zoom_goods">); the duplicated
# Popen/communicate/terminate sequence is now shared via _download.
img = bs.select('img#zoom_goods')[0].get('src')
_download(wget(img, "MAIN", prdCode))

# Detail images, saved with sequential labels DT_0, DT_1, ...
dtimgsrc = bs.select('div.detail_goodsBox div#detail img')
for i, tag in enumerate(dtimgsrc):
    _download(wget(tag.get('src'), "DT_" + str(i), prdCode))

# Log elapsed wall-clock time for this product code.
consumtime = datetime.now() - startTime
logging.warning(prdCode + "\t" + str(consumtime))