Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 256 additions & 0 deletions crawlers/food/crawl_brand_food.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
from seleniumbase import SB
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
import os

# Environment variables.
# NOTE(review): presumably consumed by SeleniumBase's undetected-chromedriver (UC)
# mode to locate the driver binary inside the Airflow image — confirm against the
# deployment; nothing in this file reads the variable directly.
os.environ["UC_DRIVER_PATH"] = "/opt/airflow/uc_driver"

def _select_text(node, selector: str, default: str = "") -> str:
    """Return the stripped text of the first *selector* match under *node*, else *default*."""
    match = node.select_one(selector)
    return match.text.strip() if match is not None else default


def _clean_price(label: str) -> str:
    """Strip the won suffix, thousands separators and the range marker from a price label."""
    return label.replace("원", "").replace(",", "").replace("~", "").strip()


def _parse_brand_item(item, brand: str, collected_at: str) -> dict:
    """Turn one product ``<li>`` of the brand listing into a flat record."""
    link = item.select_one("a[data-ref-goodsno]")
    flags = [
        text
        for text in (span.text.strip() for span in item.select("div.flags span.flag"))
        if text
    ]
    return {
        "brandName": brand,
        # Every brand crawled by this module is treated as a private-brand (PB) item.
        "isPB": 1,
        "goodsName": _select_text(item, "span.prod-name.double-line"),
        "goodsNo": link["data-ref-goodsno"] if link is not None else "",
        "salePrice": _clean_price(_select_text(item, "strong.total")),
        "originalPrice": _clean_price(_select_text(item, "span.origin")),
        "flagList": ",".join(flags),
        "isSoldout": item.select_one("span.status_flag.soldout") is not None,
        "createdAt": collected_at,
    }


def get_brand(brand_name, brand_code) -> pd.DataFrame:
    """Crawl every listing page of one Olive Young brand shop.

    Opens the brand-shop URL for *brand_code* in a headless SeleniumBase UC
    session, then walks the pagination links (``data-page-no``) until a page
    link cannot be clicked or a page contains no products.

    Args:
        brand_name: Fallback brand label used when the page has no
            ``h2.title-detail-brand`` heading.
        brand_code: Value for the ``onlBrndCd`` query parameter.

    Returns:
        A DataFrame with one row per listed product and the columns
        ``brandName``, ``isPB``, ``goodsName``, ``goodsNo``, ``salePrice``,
        ``originalPrice``, ``flagList``, ``isSoldout`` and ``createdAt``.
        Prices are digit strings (or ``""`` when missing); ``createdAt`` is the
        naive local time at which the crawl started, shared by all rows.
    """
    url = f"https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd={brand_code}"
    rows = []
    collected_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    with SB(uc=True, test=True, headless=True) as sb:
        sb.uc_open_with_reconnect(url, reconnect_time=20)

        page = 1
        while True:
            if page > 1:
                try:
                    # Click the pagination link; a missing link means we are past the last page.
                    sb.click(f"div.pageing a[data-page-no='{page}']")
                    time.sleep(2)  # wait for the AJAX reload of the product list
                except Exception as e:
                    print(f"{page}페이지 버튼 클릭 실패 또는 더 이상 페이지 없음: {e}")
                    break

            soup = BeautifulSoup(sb.driver.page_source, "html.parser")

            # Prefer the brand label shown on the page; fall back to the caller's name.
            brand = _select_text(soup, "h2.title-detail-brand", default=brand_name)

            items = soup.select("ul.prod-list.goodsProd > li")
            if not items:
                print(f"{page}페이지에 상품이 없습니다.")
                break

            rows.extend(_parse_brand_item(item, brand, collected_at) for item in items)
            page += 1

    return pd.DataFrame(rows)

def get_product_detail_info(sb, goods_no: str) -> dict:
    """Scrape one Olive Young product detail page into a flat dict.

    Args:
        sb: An open SeleniumBase session (as yielded by ``with SB(...) as sb``);
            it is navigated to the product page and left there.
        goods_no: Value for the ``goodsNo`` query parameter.

    Returns:
        A dict with the keys ``category``, ``totalComment``, ``numOfReviews``,
        ``avgReview``, ``pctOf5`` … ``pctOf1``, ``reviewDetail``, ``capacity``,
        ``detail`` and ``ingredient``. Every field is best-effort: a value that
        cannot be scraped keeps its default (``""``, ``0`` or ``None``) and the
        failure is printed. ``reviewDetail`` is always a string: ``""`` when it
        was not collected, otherwise a JSON array of
        ``{"type", "value", "gauge"}`` objects.
    """
    url = f"https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={goods_no}"
    sb.uc_open_with_reconnect(url, reconnect_time=5)
    time.sleep(1)
    soup = BeautifulSoup(sb.driver.page_source, "html.parser")

    # Category
    try:
        category = soup.select_one("a.cate_y#midCatNm").text.strip()
    except AttributeError as e:
        print(f"카테고리 추출 실패: {e}")
        category = ""

    # Headline review comment
    comment_tag = soup.select_one("p.img_face em")
    total_comment = comment_tag.text.strip() if comment_tag is not None else ""

    # Total review count: keep only the digits so the label's brackets, thousands
    # separators and unit suffix cannot break the conversion.
    try:
        count_label = soup.select_one("#repReview em").text
        total_review = int("".join(ch for ch in count_label if ch.isdigit()))
    except (AttributeError, ValueError) as e:
        print(f"총 리뷰수 파싱 실패: {e}")
        total_review = 0

    # Average review score
    try:
        review_score = float(soup.select_one("#repReview b").text.strip())
    except (AttributeError, ValueError) as e:
        print(f"리뷰평점 파싱 실패: {e}")
        review_score = None

    # Defaults for the star distribution and the review polls
    pctOf5 = pctOf4 = pctOf3 = pctOf2 = pctOf1 = None
    review_detail = ""

    # Only open the review tab when there is at least one review.
    if total_review > 0:
        try:
            sb.click("a.goods_reputation")
            WebDriverWait(sb.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ul.graph_list span.per"))
            )
            percent_list = [
                el.text.strip()
                for el in sb.find_elements("css selector", "ul.graph_list span.per")
            ]
            # The graph lists five bars, ordered from 5 stars down to 1 star.
            if len(percent_list) == 5:
                pctOf5, pctOf4, pctOf3, pctOf2, pctOf1 = percent_list

            # Collect into a local list and publish only the serialized string, so a
            # failure part-way through can never leak a list into the return value.
            poll_rows = []
            for poll in sb.find_elements("css selector", "dl.poll_type2.type3"):
                try:
                    title = poll.find_element("css selector", "span").text.strip()
                    for li in poll.find_elements("css selector", "ul.list > li"):
                        poll_rows.append({
                            "type": title,
                            "value": li.find_element("css selector", "span.txt").text.strip(),
                            "gauge": li.find_element("css selector", "em.per").text.strip(),
                        })
                except Exception as e:
                    print(f"리뷰 설문 수집 오류: {e}")
            review_detail = json.dumps(poll_rows, ensure_ascii=False)

        except Exception as e:
            print("리뷰 정보 없음:", e)

    # Open the purchase-info tab and re-parse; on failure keep the earlier soup.
    try:
        sb.click("a.goods_buyinfo")
        time.sleep(1)  # wait for the AJAX-loaded tab content
        soup = BeautifulSoup(sb.driver.page_source, "html.parser")
    except Exception as e:
        print("구매정보 탭 클릭 실패:", e)

    # Korean row title (substring match) -> output key
    title_map = {
        "용량": "capacity",
        "주요 사양": "detail",
        "모든 성분": "ingredient",
    }
    detail_spec = {key: "" for key in title_map.values()}

    try:
        for dl in soup.select("div#artcInfo dl.detail_info_list"):
            dt = dl.select_one("dt")
            dd = dl.select_one("dd")
            if dt is None or dd is None:
                continue
            dt_text = dt.text.strip()
            dd_text = dd.text.strip()
            for kr_title, en_key in title_map.items():
                if kr_title in dt_text:
                    detail_spec[en_key] = dd_text
    except Exception as e:
        print(f"[상세 스펙 파싱 오류]: {e}")

    return {
        "category": category,
        "totalComment": total_comment,
        "numOfReviews": total_review,
        "avgReview": review_score,
        "pctOf5": pctOf5,
        "pctOf4": pctOf4,
        "pctOf3": pctOf3,
        "pctOf2": pctOf2,
        "pctOf1": pctOf1,
        "reviewDetail": review_detail,
        **detail_spec,
    }

##### Run configuration #####
# Brand display name -> Olive Young online brand code (the ``onlBrndCd`` query value).
# The names double as the fallback ``brandName`` written to the output, so they
# must be real Hangul text rather than mis-decoded bytes.
PB_BRAND_CODE_DICT = {
    "바이오힐 보": "A000897",
    "브링그린": "A002253",
    "웨이크메이크": "A001240",
    "컬러그램": "A002712",
    "필리밀리": "A002502",
    "아이디얼포맨": "A001643",
    "라운드어라운드": "A001306",
    "식물나라": "A000036",
    "케어플러스": "A003339",
    "탄탄": "A015673",
    "딜라이트 프로젝트": "A003361",
}

# Example run (kept commented out so that importing this module does not start a crawl).
# get_brand takes both the brand name and the brand code.
#
# for brand_name, brand_code in PB_BRAND_CODE_DICT.items():
#     df = get_brand(brand_name, brand_code)
#
#     with SB(uc=True, test=True) as sb:
#         detail_list = []
#         for goods_no in df['goodsNo']:
#             detail = get_product_detail_info(sb, goods_no)
#             detail_list.append(detail)
#     detail_df = pd.DataFrame(detail_list)
#     result_df = pd.concat([df.reset_index(drop=True), detail_df.reset_index(drop=True)], axis=1)
#
#     now_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
#     result_df.to_json(f'suncare_result_{now_str}.json', orient='records', force_ascii=False, indent=2)
Loading