# python_apps/crwlers/navimro_cat.py

#!/usr/bin/env python
import requests, bs4, urllib, sys, re, math

def getHTML(url, timeout=30):
	"""Fetch *url* and return its body parsed as a BeautifulSoup document.

	Args:
		url: Absolute URL of the page to download.
		timeout: Seconds to wait for the server before giving up. Passed
			straight to ``requests.get``; without it a stalled connection
			would block the crawler indefinitely.

	Returns:
		A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend.

	Raises:
		requests.HTTPError: If the response status is 4xx/5xx
			(via ``raise_for_status``).
		requests.Timeout: If the server does not answer within *timeout*.
	"""
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
		# 'br' is deliberately not advertised: requests/urllib3 can only
		# decode Brotli when an optional brotli package is installed, and
		# an undecoded body would make resp.text unusable.
		'Accept-Encoding': 'gzip, deflate',
		'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
		'Upgrade-Insecure-Requests': '1',
		# NOTE(review): the Host header is fixed, so this helper is only
		# suitable for URLs served by www.navimro.com.
		'Host': 'www.navimro.com',
	}
	resp = requests.get(url, headers=headers, timeout=timeout)
	resp.raise_for_status()
	# Force UTF-8 instead of relying on the charset guessed from headers.
	resp.encoding = 'UTF-8'
	return bs4.BeautifulSoup(resp.text, 'html.parser')


def priceWithTax(price):
	"""Return integer *price* increased by 10% and rounded up to a whole unit.

	Computed with exact integer arithmetic as ceil(price * 11 / 10). The
	floating-point form ``math.ceil(price * 1.1)`` is off by one for some
	inputs, e.g. ``100 * 1.1 == 110.00000000000001`` which ceils to 111.
	(The 1.1 factor is presumably VAT -- confirm against the site.)
	"""
	return (price * 11 + 9) // 10


def parseItem(item):
	"""Extract the printable fields from one product ``<li>`` element.

	Args:
		item: A BeautifulSoup tag for a single entry of the product list.

	Returns:
		A tuple of strings in output order:
		``(promoText, seriesNum, brand, name, priceText, priceTax, linkStr)``.

	Raises:
		IndexError: If one of the mandatory elements is missing from *item*.
		ValueError: If the primary price text is not a plain integer.
	"""
	# Promotion badges are optional; join whatever is present with commas.
	promoText = ",".join(
		badge.getText().strip()
		for badge in item.select('div.item__wrap i.icon-txt')
	)
	seriesNum = item.select('div.item__qty')[0].getText().strip()
	name = item.select('a.item__title')[0].getText().strip()
	brand = item.select('a.item__brand')[0].getText().strip()
	linkStr = 'https://www.navimro.com' + item.select('a.item__img')[0].get('href')

	price = item.select('p.item__price strong.fl')
	if price:
		# Numeric price: drop thousands separators and add the 10% surcharge.
		priceText = price[0].getText().strip().replace(",", "")
		priceTax = str(priceWithTax(int(priceText)))
	else:
		# No numeric price element: fall back to the alternative price text
		# and report it unchanged in both price columns.
		priceText = item.select('div.item__wrap p.price strong')[0].getText().strip()
		priceTax = priceText
	return (promoText, seriesNum, brand, name, priceText, priceTax, linkStr)


def main(argv):
	"""Crawl every listing page under ``argv[1]`` and print one TSV row per product.

	Pages are requested as ``<argv[1]>/page-<n>/`` starting at n = 1 and the
	crawl stops at the first page that yields no product entries. Each row is
	``url, rank, promo, series, brand, name, price, price+tax, link`` where
	rank counts products across all pages, starting at 1.
	"""
	if len(argv) < 2:
		# Exit with a readable message instead of an IndexError traceback.
		sys.exit("usage: " + argv[0] + " <base-url>")

	baseUrl = str(argv[1])
	rank = 1
	page = 1
	while True:
		bs = getHTML(baseUrl + "/page-" + str(page) + "/")
		items = bs.select('div.product-list.clearFix div.product-list-area ul.clearFix li')
		if not items:
			break
		for item in items:
			print("\t".join((baseUrl, str(rank)) + parseItem(item)))
			rank += 1
		page += 1


if __name__ == "__main__":
	main(sys.argv)