"""Scrape one Naver SmartStore product page and print its detail content.

Usage: python <this script> <product-seq>

The script fetches the product page for the given sequence number, reads the
product identifiers from two embedded ``<script>`` payloads, calls the detail
content API, and prints one tab-separated row: ``seq``, page URL, and an HTML
fragment that interleaves the detail images with the text paragraphs.

Images served through the SmartStore image proxy are either swapped for a
known static replacement or downloaded with ``wget`` into ``DOWNLOAD_DIR``
and referenced under ``MIRROR_BASE_URL``.

NOTE(review): the reviewed copy of this file had the HTML markup inside its
string literals stripped (the literals were left empty or broken across
lines). Every tag literal below marked "reconstructed" is an assumption
derived from how the value is used later -- confirm against the original.
"""

import json
import re
import subprocess
import sys
from pathlib import Path

import bs4
import requests

STORE_PRODUCT_URL = "https://smartstore.naver.com/conteenew/products/"
DETAIL_API_URL = "https://smartstore.naver.com/i/v1/products/"
PROXY_PREFIX = "https://proxy.smartstore.naver.com"
MIRROR_BASE_URL = "https://kr.object.ncloudstorage.com/distimgs/artwork/"
DOWNLOAD_DIR = Path("/Users/maddiekorea/Public/")

HTTP_TIMEOUT = 30  # seconds; the original requests had no timeout at all
DOWNLOAD_TIMEOUT = 80  # seconds; same limit the original gave wget

PRELOADED_STATE_PREFIX = re.compile(r"^window\.__PRELOADED_STATE__=")
PARAGRAPH_SEPARATOR = re.compile(r"\ \ ")  # two consecutive spaces

# Shared banner images that already have a hosted copy; keyed by proxy URL.
STATIC_IMAGE_REPLACEMENTS = {
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY29udGVlbmV3X2luZm8uanBn?token=110e78c61af505948b52baab70886ddd":
        "https://kr.object.ncloudstorage.com/distimgs/foot.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8xLmpwZw==?token=7880952c83e2f27e3bc32541a5c97c5b":
        "https://kr.object.ncloudstorage.com/distimgs/prddesc.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8yLmpwZw==?token=9ca9ca7d674233b81216140fdde274e7":
        "https://kr.object.ncloudstorage.com/distimgs/sizedesc.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8zLmpwZw==?token=a349ca69df48f078821a2afe857925d2":
        "https://kr.object.ncloudstorage.com/distimgs/delivery.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX3RpdC5wbmc=?token=cf833fc0edcc17068d811ac1aeb4fde5":
        "https://kr.object.ncloudstorage.com/distimgs/canvas_title.png",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvYXJ0d29ya190aXQucG5n?token=6b88945cbe05fa6aed3ee21cc6e92d16":
        "https://kr.object.ncloudstorage.com/distimgs/title.png",
}


def fetch_text(url):
    """Return the body of *url* decoded as UTF-8.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: if the server does not answer within HTTP_TIMEOUT.
    """
    response = requests.get(url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    return response.text


def load_script_json(tag, prefix=None):
    """Parse the JSON carried inside a ``<script>`` element.

    The element's text is read through ``tag.string`` rather than by
    regex-stripping the serialized tag, so the ``<script ...>`` wrapper can
    never leak into ``json.loads``.

    Args:
        tag: the bs4 ``<script>`` element, or None when the lookup failed.
        prefix: optional compiled regex for a leading JavaScript assignment
            to remove before parsing.

    Raises:
        ValueError: if the element is missing or has no text payload.
    """
    if tag is None or tag.string is None:
        raise ValueError("expected <script> payload was not found in the page")
    payload = tag.string.strip()
    if prefix is not None:
        payload = prefix.sub("", payload)
    return json.loads(payload)


def build_paragraphs(text_content):
    """Split the plain-text description into serialized ``<p>`` elements.

    Paragraphs are delimited by two consecutive spaces; empty paragraphs
    are dropped. The text is round-tripped through bs4 exactly as the
    original did, so each item is bs4's serialization of one paragraph.
    """
    # NOTE(review): "<p>", "</p><p>", "</p>" and "<p></p>" are reconstructed;
    # the later select("p") only works if these literals are paragraph tags.
    markup = "<p>" + PARAGRAPH_SEPARATOR.sub("</p><p>", text_content) + "</p>"
    markup = markup.replace("<p></p>", "")
    soup = bs4.BeautifulSoup(markup, 'html.parser')
    return [str(paragraph) for paragraph in soup.select("p")]


def mirror_image(image_url, seq, index):
    """Download a proxied image and return the URL of its hosted copy.

    The file is saved as ``<seq>_<index>`` in DOWNLOAD_DIR, then renamed to
    carry the image format reported by Pillow as its extension. The
    returned URL assumes that file is published under MIRROR_BASE_URL by
    some other process; this script does not upload anything.

    Raises:
        subprocess.CalledProcessError: if wget exits with a failure status.
        subprocess.TimeoutExpired: if wget exceeds DOWNLOAD_TIMEOUT.
    """
    # Imported lazily, as in the original, so Pillow is only required when
    # an image actually has to be mirrored.
    from PIL import Image

    base_name = str(seq) + "_" + str(index)
    downloaded = DOWNLOAD_DIR / base_name

    # The URL comes from remote HTML, so it is passed as a separate argv
    # item instead of being spliced into a shell command line.
    subprocess.run(
        ["wget", "-O", str(downloaded), image_url],
        stdout=subprocess.DEVNULL,
        timeout=DOWNLOAD_TIMEOUT,
        check=True,
    )

    with Image.open(downloaded) as image:
        image_format = image.format

    final_name = base_name + "." + image_format
    downloaded.rename(DOWNLOAD_DIR / final_name)
    return MIRROR_BASE_URL + final_name


def resolve_image_url(image_url, seq, index):
    """Map a detail-image URL to the URL that should appear in the output."""
    image_url = STATIC_IMAGE_REPLACEMENTS.get(image_url, image_url)
    if image_url.startswith(PROXY_PREFIX):
        return mirror_image(image_url, seq, index)
    return image_url


def build_detail_html(seq, page_html):
    """Return the HTML fragment for the product whose page is *page_html*."""
    page = bs4.BeautifulSoup(page_html, 'html.parser')

    linked_data = load_script_json(
        page.find('script', type="application/ld+json")
    )
    scripts = page.select('script')
    if len(scripts) < 2:
        raise ValueError("page has no second <script> element to read state from")
    # The preloaded state is taken from the second <script> by position,
    # exactly as the original did.
    state = load_script_json(scripts[1], prefix=PRELOADED_STATE_PREFIX)

    product_id = linked_data["productID"]
    product_no = state["product"]["A"]["productNo"]
    detail_api_url = (
        DETAIL_API_URL + str(product_id) + "/contents/" + str(product_no) + "/PC"
    )
    detail = json.loads(fetch_text(detail_api_url))

    images = bs4.BeautifulSoup(detail["renderContent"], 'html.parser').select("img")
    paragraphs = build_paragraphs(detail["textContent"])

    # NOTE(review): the wrapper and image markup below are reconstructed;
    # the original literals were not recoverable from the reviewed source.
    parts = ["<div>"]
    for index, image in enumerate(images):
        # The first paragraph goes before the second image and all remaining
        # paragraphs before the third. Slices keep this safe when the text
        # produced fewer paragraphs than expected.
        if index == 1:
            parts.extend(paragraphs[:1])
        if index == 2:
            parts.extend(paragraphs[1:])
        image_url = resolve_image_url(image.attrs['data-src'], seq, index)
        parts.append('<p><img src="' + image_url + '"></p>')
    parts.append("</div>")
    return "".join(parts)


def main(argv):
    """Run the scraper for the product sequence given on the command line."""
    if len(argv) < 2:
        print("usage: " + argv[0] + " <product-seq>", file=sys.stderr)
        return 2

    seq = str(argv[1])
    url = STORE_PRODUCT_URL + seq
    result = build_detail_html(seq, fetch_text(url))
    print(str(seq) + "\t" + str(url) + "\t" + result)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))