python_apps/crwlers/vitson_detail.py

#!/Users/maddiekorea/Workspace/bin/python
#version 201904
import requests, bs4, sys, logging, os, subprocess
from datetime import datetime
startTime = datetime.now()
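# Product detail endpoint; each product is identified by its goodsNo parameter.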
url = "https://www.vitsonmro.com/goods/goodsDetail.do"
def query(keyword):
    # Encode the product number as UTF-8 bytes for the request parameter.
    return keyword.encode('utf-8')
def parameters(query):
    # The detail page takes a single form parameter, the product number.
    data = {
        'goodsNo': query
    }
    return data
def headers():
    # Mimic a desktop Chrome browser so the request looks like normal traffic.
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.vitsonmro.com',
        'Pragma': 'no-cache',
        'Referer': 'https://www.vitsonmro.com',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    return headers
def pageRequest(url, parameters, headers):
    # Fetch the detail page, fail fast on HTTP errors, and force UTF-8 decoding.
    resp = requests.get(url, params=parameters, headers=headers)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text
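# The product number (goodsNo) to crawl is taken from the first command-line argument.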
term = str(sys.argv[1])
htmlHead = pageRequest(url, parameters(query(term)), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')
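# Collect every image URL from the product photo slider.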
imgArr = bs.select('div#product_photo ul.goods_view_slider li img')
img = [tag.get('src') for tag in imgArr]
directory = '/home/maddiekorea/imgs/crawl/' + datetime.now().strftime("%Y%m%d") + "_vitson"
if not os.path.exists(directory):
    os.makedirs(directory)
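# Download each image with wget, naming files "<goodsNo>_<index>.<extension>".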
for i in range(len(img)):
    # The last dot-separated token of the URL is used as the file extension.
    extension = img[i].split(".")[-1]
    filename = directory + "/" + term + "_" + str(i) + "." + extension
    shellcmd = "wget -O " + filename + " " + '"' + img[i] + '"'
    print(term + "\t" + img[i])
    # Run wget in a subshell; abandon the download if it exceeds 80 seconds.
    executeCrawl = subprocess.Popen(shellcmd, stdout=subprocess.PIPE, shell=True)
    try:
        executeCrawl.communicate(timeout=80)
    except subprocess.TimeoutExpired:
        executeCrawl.terminate()
        continue
# Log how long the whole crawl took for this product.
consumedTime = datetime.now() - startTime
logging.warning(term + "\t" + str(consumedTime))