#!/usr/bin/env python
"""Scrape one product page and print a tab-separated row per option-table item.

Usage: pass the product page URL as the only command-line argument.

Each printed row has the form::

    url, brand, title, promo labels, description, "", header1, cell1, ...

(tab-separated; the empty field comes from the double tab that separates the
fixed columns from the header/cell pairs).  After each row a progress line
``url:brand:title:row_number`` is logged at WARNING level.

The CSS selectors and the hard-coded ``Host`` request header target
www.navimro.com.
"""
import logging
import math
import re
import sys
import urllib

import bs4
import requests

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# The last 3 header cells and the last 5 cells of every item row are not
# exported.
# NOTE(review): presumably these are non-spec columns (price, quantity,
# buttons, ...) -- confirm against the live page layout.
SKIPPED_TRAILING_HEADERS = 3
SKIPPED_TRAILING_CELLS = 5

# DOTALL lets a comment that spans several lines be removed as well.
_HTML_COMMENT = re.compile(r"<!--.+?-->", re.DOTALL)
_LINE_BREAKS_AND_TABS = re.compile(r"[\r\n\t]")
_FRACTION_BEFORE_SPACE = re.compile(r"\.[0-9]+\s")
_PROMO_LABELS = re.compile("할인특가|캠페인할인")


def getHTML(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    The response body is always decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within
            ``REQUEST_TIMEOUT`` seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        # NOTE(review): "br" is advertised unconditionally; decoding brotli
        # responses presumably needs an optional brotli package -- confirm.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        # Hard-coded host: this ties the function to www.navimro.com URLs.
        'Host': 'www.navimro.com',
    }
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def _select_first(bs, selector):
    """Return the first element matching *selector*, or raise a clear IndexError."""
    matches = bs.select(selector)
    if not matches:
        raise IndexError("no element matches selector %r" % selector)
    return matches[0]


def _clean_description(desc):
    """Strip HTML comments, line breaks and tabs from the description markup."""
    # NOTE(review): this pattern only removes the literal text "|</td>"; it
    # looks truncated and was probably meant to strip the enclosing <td> tags.
    # Kept as written so the output does not change -- confirm the intent.
    desc = re.sub(r"\|</td>", "", desc)
    desc = _HTML_COMMENT.sub("", desc)
    return _LINE_BREAKS_AND_TABS.sub("", desc)


def _clean_cell(text):
    """Normalize the text of one option-table cell."""
    text = text.strip()
    # Drops a ".digits" run that is followed by whitespace.
    text = _FRACTION_BEFORE_SPACE.sub("", text)
    text = _LINE_BREAKS_AND_TABS.sub("", text)
    # Removes the promotion badge labels embedded in the cell text.
    return _PROMO_LABELS.sub("", text)


def _format_row(prefix, headers, cells):
    """Return *prefix* followed by a tab-led "header, cell" pair per header.

    A row with fewer cells than headers gets empty strings for the missing
    cells, so the columns stay aligned instead of the lookup failing.
    """
    pairs = []
    for index, header in enumerate(headers):
        cell = cells[index] if index < len(cells) else ""
        pairs.append("\t" + header + "\t" + cell)
    return prefix + "".join(pairs)


def main(argv=None):
    """Scrape the page named by ``argv[1]`` and print its item rows."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: %s URL" % (argv[0] if argv else "scraper"))
    url = str(argv[1])

    bs = getHTML(url)
    title = _select_first(bs, 'div.spec div.product-detail-area').getText().strip()
    brand = _select_first(bs, 'div.spec table td div.brand-product-new span').getText().strip()
    desc = _clean_description(
        str(_select_first(bs, 'div.spec table tr.description td.desc_info'))
    )
    promo_text = ",".join(
        tag.getText().strip() for tag in bs.select('div.spec table td i.icon-txt')
    )

    header_cells = bs.select('div.option-table table.itemTable th')
    exported_headers = max(len(header_cells) - SKIPPED_TRAILING_HEADERS, 0)
    headers = [th.getText().strip() for th in header_cells[:exported_headers]]

    rows = []
    for item in bs.select('div.option-table table.itemTable tbody tr.itemList'):
        cells = item.select('td')
        exported_cells = max(len(cells) - SKIPPED_TRAILING_CELLS, 0)
        rows.append([_clean_cell(td.getText()) for td in cells[:exported_cells]])

    # The trailing tab plus the tab that leads every header/cell pair yields
    # the double tab (one empty field) of the established output format.
    prefix = "\t".join([url, brand, title, promo_text, desc]) + "\t"
    for row_number, cells in enumerate(rows, start=1):
        print(_format_row(prefix, headers, cells))
        logging.warning("%s:%s:%s:%d", url, brand, title, row_number)


if __name__ == "__main__":
    main()