#!/Users/maddiekorea/Workspace/bin/python
# version 201904
# Crawl product images from vitsonmro.com for a given goods number
# and download them with wget into a dated directory.
import requests, bs4, sys, logging, os
import subprocess
from datetime import datetime

startTime = datetime.now()
url = "https://www.vitsonmro.com/goods/goodsDetail.do"


def query(keyword):
    # Encode the goods number as UTF-8 bytes for the request parameter.
    return keyword.encode('utf-8')


def parameters(query):
    return {'goodsNo': query}


def headers():
    # Mimic a desktop Chrome browser so the request is not rejected.
    return {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.vitsonmro.com',
        'Pragma': 'no-cache',
        'Referer': 'https://www.vitsonmro.com',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }


def pageRequest(url, parameters, headers):
    # Fetch the product detail page and return its HTML as UTF-8 text.
    resp = requests.get(url, params=parameters, headers=headers)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text


term = str(sys.argv[1])  # goods number passed on the command line
htmlHead = pageRequest(url, parameters(query(term)), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

# Collect the image URLs from the product photo slider.
img = []
for tag in bs.select('div#product_photo ul.goods_view_slider li img'):
    img.append(tag.get('src'))

# Save images under a per-day directory, e.g. .../20190401_vitson.
directory = '/home/maddiekorea/imgs/crawl/' + datetime.now().strftime("%Y%m%d") + "_vitson"
if not os.path.exists(directory):
    os.makedirs(directory)

for i in range(len(img)):
    # Keep the original file extension (the part after the last dot).
    extension = img[i].split(".")[-1]
    filename = directory + "/" + term + "_" + str(i) + "." + extension
    shellcmd = 'wget -O "' + filename + '" "' + img[i] + '"'
    print(term + "\t" + img[i])
    executeCrawl = subprocess.Popen(shellcmd, stdout=subprocess.PIPE, shell=True)
    try:
        # Give each download up to 80 seconds, then move on to the next image.
        executeCrawl.communicate(timeout=80)
    except subprocess.TimeoutExpired:
        executeCrawl.terminate()
        continue

consumedTime = datetime.now() - startTime
logging.warning(term + "\t" + str(consumedTime))