#!/Users/maddiekorea/Workspace/bin/python
# version 202103
import logging
import os
import subprocess
import sys
from datetime import datetime

import bs4
import requests

startTime = datetime.now()
url = "https://www.imarket.co.kr/product/MallDisplay.do"


def query(keyword):
    # The site expects EUC-KR-encoded query values, not UTF-8.
    return keyword.encode('euc-kr')


def parameters(page, query):
    # NOTE: `page` is currently unused; the product-detail view is not paginated.
    data = {
        '_method': 'Detail',
        'sc.prdNo': query,
    }
    return data


def headers():
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.imarket.co.kr',
        'Pragma': 'no-cache',
        'Referer': 'http://www.imarket.co.kr/',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    return headers


def pageRequest(url, parameters, headers):
    resp = requests.get(url, params=parameters, headers=headers)
    resp.raise_for_status()
    resp.encoding = 'EUC-KR'  # the page is served in EUC-KR
    return resp.text


term = str(sys.argv[1])
htmlHead = pageRequest(url, parameters(1, query(term)), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

img = []
# img.insert(0, bs.select('div.img_big img#product_img_big')[0].get('src'))
# img.insert(1, bs.select('meta#ogImage')[0].get('content'))

# Collect detail-section images hosted off-site, skipping anything served
# from imarket.co.kr itself. append() replaces the original insert(i, ...),
# which left gaps in the list indices whenever an image was skipped.
detail_img = bs.select('div.brand_detail img')
for detail_tag in detail_img:
    dimg = detail_tag.get('src')
    if dimg and "imarket.co.kr" not in dimg:
        img.append(dimg)

# Create the dated output directory once, before the download loop.
directory = '/home/maddiekorea/imgs/crawl/_chrome_' + datetime.now().strftime("%Y%m%d")
if not os.path.exists(directory):
    os.makedirs(directory)

for i in range(len(img)):
    # Treat whatever follows the last dot in the URL as the file extension.
    extension = img[i].split(".")[-1].upper()

    # if i < 2:
    #     filename = directory + "/" + term + "_" + str(i) + "." + extension
    # else:
    filename = directory + "/" + term + "_d_" + str(i) + "." + extension
    print(term + "\t" + img[i] + "\t" + term + "_d_" + str(i) + "." + extension)

    # Pass the arguments as a list instead of a concatenated shell string,
    # so filenames and URLs cannot be misparsed (or injected) by the shell.
    shellcmd = ["wget", "--tries=2", "--timeout=10", "-O", filename, img[i]]
    executeCrawl = subprocess.Popen(shellcmd, stdout=subprocess.PIPE)
    try:
        executeCrawl.communicate(timeout=80)
    except subprocess.TimeoutExpired:
        executeCrawl.terminate()
        # continue

consumedTime = datetime.now() - startTime
logging.warning(term + "\t" + str(consumedTime))
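

# --- Optional: downloading without shelling out to wget ---
# A minimal sketch of an alternative downloader that reuses the requests
# library already imported above, removing the external wget dependency.
# `download_with_requests` is a hypothetical helper, not part of the
# original script; to use it, define it before the download loop and call
# it in place of the subprocess invocation.
def download_with_requests(src_url, dest_path, tries=2, timeout=10):
    """Fetch src_url to dest_path, retrying up to `tries` times."""
    for attempt in range(tries):
        try:
            resp = requests.get(src_url, timeout=timeout, stream=True)
            resp.raise_for_status()
            # Stream the body to disk in chunks to avoid holding
            # large images entirely in memory.
            with open(dest_path, 'wb') as fh:
                for chunk in resp.iter_content(chunk_size=8192):
                    fh.write(chunk)
            return True
        except requests.RequestException:
            if attempt == tries - 1:
                return False
    return False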