66 lines
2.3 KiB
Python
66 lines
2.3 KiB
Python
#!/usr/bin/env python
|
|
import requests, bs4, urllib, sys, re, math, logging
|
|
|
|
def getHTML(url, timeout=30):
    """Download *url* and return its markup parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled connection would block
            the script indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection errors, timeouts, or (via
            ``raise_for_status``) a 4xx/5xx response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        # "br" is deliberately not advertised: brotli responses can only be
        # decoded when an optional brotli package is installed, and otherwise
        # resp.text would contain compressed bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'www.navimro.com',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # Force UTF-8 rather than trusting the charset guessed from the headers.
    resp.encoding = 'UTF-8'
    return bs4.BeautifulSoup(resp.text, 'html.parser')
|
|
|
|
# Patterns are compiled once; raw strings avoid invalid-escape warnings.
# "[^>]*" also matches a bare "<td>" and attributes spread over several lines.
_TD_TAG_RE = re.compile(r"<td\b[^>]*>|</td>")
# DOTALL so that comments spanning several lines are removed as well.
_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
_LINE_CONTROL_RE = re.compile(r"[\r\n\t]")
_FRACTION_RE = re.compile(r"\.[0-9]+\s")
_PROMO_LABEL_RE = re.compile("할인특가|캠페인할인")


def cleanDescription(cellHtml):
    """Return description markup without <td> wrappers, comments and line breaks.

    Args:
        cellHtml: Serialized HTML of the description cell.

    Returns:
        The markup with every ``<td ...>``/``</td>`` tag and HTML comment
        removed and all newline, tab and carriage-return characters dropped.
    """
    text = _TD_TAG_RE.sub("", cellHtml)
    text = _COMMENT_RE.sub("", text)
    return _LINE_CONTROL_RE.sub("", text)


def cleanCell(text):
    """Normalize the text of one option-table cell.

    Strips surrounding whitespace, removes a decimal fraction that is followed
    by whitespace, drops newline/tab/carriage-return characters and removes
    the promotional labels matched by ``_PROMO_LABEL_RE``.
    """
    text = text.strip()
    text = _FRACTION_RE.sub("", text)
    text = _LINE_CONTROL_RE.sub("", text)
    return _PROMO_LABEL_RE.sub("", text)


def formatRow(headings, cells):
    """Build the tab-separated ``heading<TAB>value`` tail for one option row.

    Args:
        headings: Column titles, in order.
        cells: Cleaned cell texts of the row, in the same order.

    Returns:
        ``"\\t" + heading + "\\t" + value`` concatenated for every heading.
        A row with fewer cells than headings is padded with empty values
        instead of failing.
    """
    parts = []
    for idx, heading in enumerate(headings):
        value = cells[idx] if idx < len(cells) else ""
        parts.append("\t" + heading + "\t" + value)
    return "".join(parts)


def main(argv=None):
    """Scrape one product page and print a tab-separated line per option row.

    Args:
        argv: Argument list; defaults to ``sys.argv``. ``argv[1]`` is the
            product page URL.

    Returns:
        ``0`` on success, ``2`` when the URL argument is missing.

    Raises:
        IndexError: If the page lacks the title, brand or description element
            the selectors expect.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "scraper"
        sys.stderr.write("usage: " + prog + " URL\n")
        return 2

    url = str(args[1])
    bs = getHTML(url)

    title = bs.select('div.spec div.product-detail-area')[0].getText().strip()
    promo = bs.select('div.spec table td i.icon-txt')
    brand = bs.select('div.spec table td div.brand-product-new span')[0].getText().strip()
    desc = cleanDescription(str(bs.select('div.spec table tr.description td.desc_info')[0]))
    seriesTitles = bs.select('div.option-table table.itemTable th')
    seriesContent = bs.select('div.option-table table.itemTable tbody tr.itemList')

    # The last three header cells and the last five cells of every row are
    # skipped -- presumably non-specification columns; verify against the page.
    headings = [th.getText().strip() for th in seriesTitles[:-3]]
    rows = [
        [cleanCell(td.getText()) for td in tr.select('td')[:-5]]
        for tr in seriesContent
    ]
    promoText = ",".join(tag.getText().strip() for tag in promo)

    output = url + "\t" + brand + "\t" + title + "\t" + promoText + "\t" + desc + "\t"
    for cells in rows:
        print(output + formatRow(headings, cells))

    # Log the number of rows emitted; len(rows) is also valid when the page
    # has no option rows, where a leftover loop index would be undefined.
    logging.warning("%s:%s:%s:%d", url, brand, title, len(rows))
    return 0


if __name__ == "__main__":
    sys.exit(main())