87 lines
2.4 KiB
Python
87 lines
2.4 KiB
Python
#!/Users/maddiekorea/Workspace/bin/python
|
|
#version 201904
|
|
import requests, bs4, urllib, sys, re, math, logging, os
|
|
from urllib import parse
|
|
from datetime import datetime
|
|
import subprocess
|
|
# Wall-clock start of the run; subtracted from the end time at the bottom of
# the script to log how long the crawl took.
startTime = datetime.now()
|
|
|
|
# Endpoint this script requests; passed to pageRequest() as the GET target.
url = "https://www.imarket.co.kr/product/MallDisplay.do"
|
|
|
|
def query(keyword):
    """Return *keyword* encoded to bytes using the EUC-KR codec."""
    return keyword.encode('euc-kr')
|
|
|
|
def parameters(page, query):
    """Build the request parameters for a product detail lookup.

    *query* is sent as the ``sc.prdNo`` field. *page* is accepted but is not
    used when building the payload.
    """
    return {'_method': 'Detail', 'sc.prdNo': query}
|
|
|
|
def headers():
    """Return the fixed HTTP request headers sent with every page request.

    A new dict is built on each call, so callers may modify the result freely.
    """
    accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    return dict([
        ('Content-Type', 'application/x-www-form-urlencoded'),
        ('Accept', accept),
        ('Accept-Encoding', 'gzip, deflate'),
        ('Accept-Language', 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'),
        ('Cache-Control', 'no-cache'),
        ('Connection', 'keep-alive'),
        ('Host', 'www.imarket.co.kr'),
        ('Pragma', 'no-cache'),
        ('Referer', 'https://www.imarket.co.kr/'),
        ('Save-Data', 'on'),
        ('Upgrade-Insecure-Requests', '1'),
        ('User-Agent', user_agent),
    ])
|
|
|
|
def pageRequest(url, parameters, headers, timeout=30):
    """Fetch *url* with an HTTP GET and return the body decoded as EUC-KR.

    Args:
        url: Address to request.
        parameters: Mapping sent as the query string.
        headers: Mapping of HTTP request headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within *timeout*.
    """
    resp = requests.get(url, params=parameters, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # Force EUC-KR decoding instead of whatever requests inferred.
    resp.encoding = 'EUC-KR'
    return resp.text
|
|
|
|
# Script body: fetch the product detail page for the product number given on
# the command line and download every product image found on it with wget.
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " <product-number>")

term = str(sys.argv[1])

htmlHead = pageRequest(url, parameters(1, query(term)), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

# (file stem, image URL) pairs. The stem is fixed when the image is found, so
# a missing image never shifts the numbering of the others:
#   <term>_0    main product image
#   <term>_1    og:image meta tag
#   <term>_d_N  N-th image inside the detail section
img = []

main_img = bs.select('div.img_big img#product_img_big')
if main_img and main_img[0].get('src'):
    img.append((term + "_0", main_img[0].get('src')))
else:
    logging.warning(term + "\tmain product image not found")

og_img = bs.select('meta#ogImage')
if og_img and og_img[0].get('content'):
    img.append((term + "_1", og_img[0].get('content')))
else:
    logging.warning(term + "\tog:image meta tag not found")

detail_img = bs.select('div.brand_detail img')
for i, node in enumerate(detail_img):
    dimg = node.get('src')
    if dimg:  # skip <img> tags without a src attribute
        img.append((term + "_d_" + str(i), dimg))

# One output directory per calendar day; created once, not per image.
directory = '/home/maddiekorea/imgs/crawl/' + datetime.now().strftime("%Y%m%d")
os.makedirs(directory, exist_ok=True)

for stem, src in img:
    # Resolve relative and protocol-relative references against the page URL.
    src = parse.urljoin(url, src)
    # Take the extension from the URL path only, so query strings and dots in
    # the host name cannot end up in the file name.
    extension = os.path.splitext(parse.urlsplit(src).path)[1]
    filename = directory + "/" + stem + extension

    print(term + "\t" + src)
    try:
        # Argument list with no shell: neither the command-line term nor the
        # scraped URL can inject shell syntax. On timeout, run() kills and
        # reaps the child before raising.
        subprocess.run(
            ["wget", "-O", filename, src],
            stdout=subprocess.DEVNULL,
            timeout=80,
        )
    except subprocess.TimeoutExpired:
        logging.warning(term + "\tdownload timed out: " + src)
    except OSError as err:
        # wget missing or not executable; no later download can succeed either.
        logging.error("could not run wget: " + str(err))
        break

consumtime = datetime.now() - startTime
logging.warning(term + "\t" + str(consumtime))
|