python_apps/crwlers/conteenew/smartStoreDetail.py
2023-11-03 14:49:12 +09:00

107 lines
4.2 KiB
Python

import json
import os
import re
import subprocess
import sys

import bs4
import requests
# Crawl one product page of the "conteenew" Naver SmartStore and print a single
# tab-separated line: <seq> TAB <product url> TAB <detail HTML>.
#
# Usage: python smartStoreDetail.py <product-seq>
#
# The detail HTML is rebuilt from the product's detail-content API response:
# every <img> of the rendered content is re-emitted in order, the first text
# paragraph is inserted before the second image and the remaining paragraphs
# before the third image.  Images still served from the SmartStore proxy are
# downloaded locally and re-pointed at the object-storage bucket.

STORE_URL = "https://smartstore.naver.com/conteenew/products/"
DETAIL_API_URL = (
    "https://smartstore.naver.com/i/v1/products/{product_id}/contents/{product_no}/PC"
)
PROXY_PREFIX = "https://proxy.smartstore.naver.com"
ARTWORK_URL = "https://kr.object.ncloudstorage.com/distimgs/artwork/"
DOWNLOAD_DIR = "/Users/maddiekorea/Public/"
PRELOADED_STATE_PREFIX = "window.__PRELOADED_STATE__="
HTTP_TIMEOUT = 30  # seconds; without it requests.get() may block forever
DOWNLOAD_TIMEOUT = 80  # seconds allowed for one wget download

# Proxy URLs that are replaced by fixed object-storage URLs; images listed
# here are never downloaded.
KNOWN_IMAGES = {
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY29udGVlbmV3X2luZm8uanBn?token=110e78c61af505948b52baab70886ddd":
        "https://kr.object.ncloudstorage.com/distimgs/foot.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8xLmpwZw==?token=7880952c83e2f27e3bc32541a5c97c5b":
        "https://kr.object.ncloudstorage.com/distimgs/prddesc.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8yLmpwZw==?token=9ca9ca7d674233b81216140fdde274e7":
        "https://kr.object.ncloudstorage.com/distimgs/sizedesc.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX2RldGFpbF8zLmpwZw==?token=a349ca69df48f078821a2afe857925d2":
        "https://kr.object.ncloudstorage.com/distimgs/delivery.jpg",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvY2FudmFzX3RpdC5wbmc=?token=cf833fc0edcc17068d811ac1aeb4fde5":
        "https://kr.object.ncloudstorage.com/distimgs/canvas_title.png",
    "https://proxy.smartstore.naver.com/img/aW1hZ2UuY29udGVlbmV3LmNvbS9ncmFmb2xpby9kZXRhaWwvYXJ0d29ya190aXQucG5n?token=6b88945cbe05fa6aed3ee21cc6e92d16":
        "https://kr.object.ncloudstorage.com/distimgs/title.png",
}


def strip_script_tag(markup):
    """Return the text between the opening and closing <script> tags of *markup*."""
    body = re.sub(r"^<script[^>]*>", "", markup.strip())
    return re.sub(r"</script>$", "", body).strip()


def extract_preloaded_state(script_texts):
    """Decode the JSON assigned to window.__PRELOADED_STATE__.

    *script_texts* is an iterable of script bodies (tags already stripped).
    The state script is located by its prefix rather than by its position in
    the page.  Raises ValueError when no script carries the prefix.
    """
    for text in script_texts:
        if text.startswith(PRELOADED_STATE_PREFIX):
            return json.loads(text[len(PRELOADED_STATE_PREFIX):])
    raise ValueError("no script starting with " + PRELOADED_STATE_PREFIX)


def text_to_paragraph_markup(text):
    """Wrap *text* in <p> tags, splitting on double spaces and dropping empty paragraphs."""
    markup = "<p>" + text.replace("  ", "</p><p>") + "</p>"
    return markup.replace("<p></p>", "")


def swap_known_image(image_url):
    """Return the fixed replacement for a known proxy URL, else *image_url* unchanged."""
    return KNOWN_IMAGES.get(image_url, image_url)


def fetch_text(url):
    """GET *url* and return the body decoded as UTF-8; raises on HTTP errors."""
    response = requests.get(url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    response.encoding = "UTF-8"
    return response.text


def download_artwork(image_url, basename):
    """Download *image_url* into DOWNLOAD_DIR and return its artwork-bucket URL.

    The file is saved as *basename*, then renamed to ``<basename>.<FORMAT>``
    where FORMAT is the image format reported by Pillow (e.g. ``JPEG``).
    Raises subprocess.CalledProcessError / subprocess.TimeoutExpired when wget
    fails or exceeds DOWNLOAD_TIMEOUT.
    """
    # Imported lazily: Pillow is only needed when a proxy image is present.
    from PIL import Image

    target = DOWNLOAD_DIR + basename
    # Argument list, no shell: the URL comes from remote HTML and may contain
    # shell metacharacters such as "&" or ";".
    subprocess.run(
        ["wget", "-O", target, image_url],
        stdout=subprocess.DEVNULL,
        check=True,
        timeout=DOWNLOAD_TIMEOUT,
    )
    with Image.open(target) as picture:
        image_format = picture.format
    final_name = basename + "." + image_format
    os.replace(target, DOWNLOAD_DIR + final_name)
    return ARTWORK_URL + final_name


def build_detail_markup(seq, images, paragraphs):
    """Assemble the centered detail HTML from image tags and text paragraphs.

    *images* are tag-like objects exposing an ``attrs`` mapping; *paragraphs*
    are objects whose ``str()`` is their HTML.  *seq* is only used to name
    downloaded files (``<seq>_<image index>``).
    """
    parts = ["<center>"]
    for index, image in enumerate(images):
        # Text placement: first paragraph before the 2nd image, the rest
        # before the 3rd image.
        if index == 1 and paragraphs:
            parts.append(str(paragraphs[0]))
        if index == 2:
            parts.extend(str(paragraph) for paragraph in paragraphs[1:])

        image_url = image.attrs.get("data-src") or image.attrs.get("src")
        if not image_url:
            continue  # nothing to emit for an <img> without a source

        image_url = swap_known_image(image_url)
        if image_url.startswith(PROXY_PREFIX):
            image_url = download_artwork(image_url, "{}_{}".format(seq, index))
        parts.append(
            '<p align="center"><img src="' + image_url + '" align="center" /></p>'
        )
    parts.append("</center>")
    return "".join(parts)


def main(argv):
    """Crawl the product named by ``argv[1]`` and print the result line."""
    if len(argv) < 2:
        sys.exit("usage: smartStoreDetail.py <product-seq>")
    seq = str(argv[1])
    url = STORE_URL + seq
    page = bs4.BeautifulSoup(fetch_text(url), "html.parser")

    ld_json = page.find("script", type="application/ld+json")
    if ld_json is None:
        raise ValueError("no application/ld+json script found at " + url)
    product = json.loads(strip_script_tag(str(ld_json)))
    state = extract_preloaded_state(
        strip_script_tag(str(tag)) for tag in page.select("script")
    )

    detail_url = DETAIL_API_URL.format(
        product_id=product["productID"],
        product_no=state["product"]["A"]["productNo"],
    )
    detail = json.loads(fetch_text(detail_url))

    images = bs4.BeautifulSoup(detail["renderContent"], "html.parser").select("img")
    paragraphs = bs4.BeautifulSoup(
        text_to_paragraph_markup(detail["textContent"]), "html.parser"
    ).select("p")

    print(
        str(seq) + "\t" +
        str(url) + "\t" + build_detail_markup(seq, images, paragraphs)
    )


if __name__ == "__main__":
    main(sys.argv)