#!/Users/maddiekorea/Workspace/bin/python
# version 201904
import bs4
import logging
import os
import requests
import subprocess
import sys
from datetime import datetime
from urllib import parse

startTime = datetime.now()
url = "https://www.metaldiy.com/item/itemView.do"


def query(keyword):
    # Encode the item keyword as UTF-8 bytes for the request parameter.
    return keyword.encode('utf-8')


def parameters(itemId):
    # Build the query-string parameters for the item-view page.
    return {'itemId': itemId}


def headers():
    # Mimic a desktop Chrome browser so the server returns the normal page.
    return {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.metaldiy.com',
        'Pragma': 'no-cache',
        'Referer': 'https://www.metaldiy.com',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/66.0.3359.181 Safari/537.36'),
    }


def pageRequest(url, parameters, headers):
    # Fetch the page and return its HTML, failing fast on HTTP errors.
    resp = requests.get(url, params=parameters, headers=headers)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text


term = str(sys.argv[1])
htmlHead = pageRequest(url, parameters(query(term)), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

# The product's zoomable image carries id="zoom_goods" on the item page;
# bail out early if the element is missing instead of raising IndexError.
imgArr = bs.select('img#zoom_goods')
if not imgArr:
    logging.error(term + "\tno img#zoom_goods element found")
    sys.exit(1)

# Resolve the src against the page URL in case it is a relative path.
imgSrc = parse.urljoin(url, imgArr[0].get('src'))
print(imgSrc)

# One output directory per day, e.g. .../crawl/20190401_culmul.
directory = '/home/maddiekorea/imgs/crawl/' + datetime.now().strftime("%Y%m%d") + "_culmul"
os.makedirs(directory, exist_ok=True)

# Keep whatever extension the image URL carries (jpg, png, ...).
extension = imgSrc.split(".")[-1]
filename = directory + "/" + term + "." + extension

# Pass the arguments as a list (no shell=True) so spaces or shell
# metacharacters in the URL or filename cannot break the command.
executeCrawl = subprocess.Popen(['wget', '-O', filename, imgSrc],
                                stdout=subprocess.PIPE)
try:
    executeCrawl.communicate(timeout=80)
except subprocess.TimeoutExpired:
    executeCrawl.terminate()

elapsedTime = datetime.now() - startTime
logging.warning(term + "\t" + str(elapsedTime))
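
# ---------------------------------------------------------------------------
# Usage sketch (script name and item id below are hypothetical, for
# illustration only; pass whatever itemId the site actually uses):
#
#   ./crawl_metaldiy.py 12345
#
# This fetches https://www.metaldiy.com/item/itemView.do?itemId=12345,
# extracts the img#zoom_goods source, and saves the image as
# /home/maddiekorea/imgs/crawl/<YYYYMMDD>_culmul/12345.<ext>.
# The per-item elapsed time is emitted via logging.warning.
# ---------------------------------------------------------------------------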