Skip to content

Commit 526d48a

Browse files
committed
feat: 충돌해결
2 parents 2be4805 + 7da0b8b commit 526d48a

5 files changed

Lines changed: 625 additions & 301 deletions

File tree

Dockerfile

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<<<<<<< HEAD
12
FROM --platform=linux/amd64 apache/airflow:2.9.1
23

34
# Chrome 설치는 루트 권한이 필요하므로
@@ -66,3 +67,52 @@ USER airflow
6667

6768
# COPY requirements.txt .
6869
# RUN pip install --no-cache-dir -r requirements.txt
70+
=======
71+
# Base image, forced to linux/amd64 (the Chrome .deb installed below is an amd64 package).
FROM --platform=linux/amd64 apache/airflow:2.9.1

# Installing Chrome requires root privileges.
USER root

# Disabled alternative, kept for reference: download the .deb from Google's own pool.
# NOTE(review): wget fetches 114.0.5735.90 but the dpkg/rm lines name 114.0.5735.106;
# make the file names agree before re-enabling this.
# RUN wget https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.90-1_amd64.deb && \
#     dpkg -i google-chrome-stable_114.0.5735.106-1_amd64.deb || true && \
#     apt-get install -f -y && \
#     rm google-chrome-stable_114.0.5735.106-1_amd64.deb

# Install Chrome's runtime libraries and Chrome 114 in a single layer.
# - `apt-get install ./file.deb` resolves the package's dependencies itself and
#   fails the build if Chrome cannot be installed, instead of hiding the error
#   behind `dpkg -i ... || true`.
# - The apt package lists are removed in the same layer that created them so they
#   do not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        wget \
        gnupg2 \
        curl \
        unzip \
        fonts-liberation \
        libappindicator3-1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libcups2 \
        libdbus-1-3 \
        libgdk-pixbuf2.0-0 \
        libnspr4 \
        libnss3 \
        libxcomposite1 \
        libxdamage1 \
        libxrandr2 \
        xdg-utils && \
    wget https://mirror.cs.uchicago.edu/google-chrome/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.90-1_amd64.deb && \
    apt-get install -y --no-install-recommends ./google-chrome-stable_114.0.5735.90-1_amd64.deb && \
    rm google-chrome-stable_114.0.5735.90-1_amd64.deb && \
    rm -rf /var/lib/apt/lists/*

# chromedriver 114.0.5735.90, the same version as the Chrome package above.
RUN wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip && \
    unzip chromedriver_linux64.zip && \
    mv chromedriver /usr/local/bin/ && \
    rm chromedriver_linux64.zip

# Drop back to the unprivileged airflow user before installing Python packages.
USER airflow

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
117+
118+
>>>>>>> 7da0b8b0ffb468686e450409f4f459806908966d
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
from seleniumbase import SB
2+
from bs4 import BeautifulSoup
3+
import datetime
4+
import time
5+
import os
6+
from airflow.utils.log.logging_mixin import LoggingMixin
7+
8+
# Set SB_OPTIONS in this process's environment as a side effect of importing the module.
# NOTE(review): presumably meant to hand these Chrome flags to SeleniumBase —
# confirm that the installed SeleniumBase version actually reads this variable.
os.environ["SB_OPTIONS"] = "--no-sandbox --disable-dev-shm-usage --disable-gpu"
9+
10+
def _brand_text(node, selector, fallback=""):
    """Return the stripped text of the first *selector* match under *node*.

    Any failure (typically ``select_one`` returning ``None`` because nothing
    matched) yields *fallback* instead of raising.
    """
    try:
        return node.select_one(selector).text.strip()
    except Exception:
        return fallback


def _parse_brand_item(item, brand, collected_at):
    """Convert one product ``<li>`` into ``(record, goods_no)``.

    Every field is extracted on a best-effort basis: a field that cannot be
    read becomes an empty string (``False`` for the sold-out flag).
    """
    goods_name = _brand_text(item, "span.prod-name.double-line")

    # The goods number lives in an attribute of the product link.
    try:
        link = item.select_one("a[data-ref-goodsno]")
        goods_no = link["data-ref-goodsno"] if link else ""
    except Exception:
        goods_no = ""

    # Prices are kept as strings with the currency suffix and separators removed.
    sale_price = _brand_text(item, "strong.total")
    for token in ("원", ",", "~"):
        sale_price = sale_price.replace(token, "")

    original_price = _brand_text(item, "span.origin")
    for token in ("원", ","):
        original_price = original_price.replace(token, "")

    # Non-empty flag labels, comma-joined ("" when there are none).
    try:
        labels = (span.text.strip() for span in item.select("div.flags span.flag"))
        flags = ",".join(label for label in labels if label)
    except Exception:
        flags = ""

    try:
        sold_out = bool(item.select_one("span.status_flag.soldout"))
    except Exception:
        sold_out = False

    record = {
        "brandName": brand,
        "isPB": 1,  # every record produced by this crawler is flagged isPB=1
        "goodsName": goods_name,
        "salePrice": sale_price,
        "originalPrice": original_price,
        "flagList": flags,
        "isSoldout": sold_out,
        "createdAt": collected_at,
    }
    return record, goods_no


def get_brand(brand_name, brand_code):
    """Crawl every listing page of one Olive Young brand shop.

    Opens the brand shop URL for *brand_code* in a headless SeleniumBase
    browser, then walks the pagination links until a page link cannot be
    clicked or a page contains no products.

    Args:
        brand_name: Fallback brand name, used when the page's brand heading
            cannot be read.
        brand_code: Value placed in the ``onlBrndCd`` query parameter.

    Returns:
        A ``(data, goods_no_list)`` tuple. ``data`` is a list of product
        dicts (keys ``brandName``, ``isPB``, ``goodsName``, ``salePrice``,
        ``originalPrice``, ``flagList``, ``isSoldout``, ``createdAt``).
        ``goods_no_list`` holds only the non-empty goods numbers, so it can be
        shorter than ``data``.
    """
    log = LoggingMixin().log
    log.info(f"[get_brand] 시작: {brand_name} ({brand_code})")

    url = f"https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd={brand_code}"
    data, goods_no_list = [], []
    # One timestamp shared by every record of this run.
    collected_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    with SB(uc=True, test=True, headless=True) as sb:
        log.info(f"[get_brand] URL 오픈: {url}")
        sb.open(url)
        time.sleep(1)

        page = 0
        while True:
            page += 1
            log.info(f"[get_brand] {page}페이지 크롤링 시작")

            # Page 1 is already displayed after open(); later pages need a click.
            if page != 1:
                try:
                    sb.click(f"div.pageing a[data-page-no='{page}']")
                    time.sleep(2)
                except Exception as e:
                    log.warning(f"{page}페이지 버튼 클릭 실패 또는 더 이상 페이지 없음: {e}")
                    break

            soup = BeautifulSoup(sb.driver.page_source, "html.parser")

            # Prefer the brand name shown on the page over the caller's value.
            brand = _brand_text(soup, "h2.title-detail-brand", brand_name)

            items = soup.select("ul.prod-list.goodsProd > li")
            log.info(f"[get_brand] {page}페이지 상품 개수: {len(items)}")
            if not items:
                log.info(f"{page}페이지에 상품이 없습니다. 종료")
                break

            for item in items:
                record, goods_no = _parse_brand_item(item, brand, collected_at)
                if goods_no:
                    goods_no_list.append(goods_no)
                data.append(record)
            log.info(f"[get_brand] {page}페이지 누적 상품 수: {len(data)}")

    log.info(f"[get_brand] 크롤링 종료: 총 상품 {len(data)}개, goods_no {len(goods_no_list)}개")
    return data, goods_no_list
98+
99+
100+
def _extract_spec_value(soup, title, log):
    """Return the ``<dd>`` text of the first purchase-info row whose ``<dt>`` contains *title*.

    Returns ``""`` when no row matches or when parsing raises (the error is
    logged as a warning).
    """
    try:
        for dl in soup.select("div#artcInfo dl.detail_info_list"):
            dt = dl.select_one("dt")
            dd = dl.select_one("dd")
            if dt and dd:
                dt_text = dt.text.strip()
                dd_text = dd.text.strip()
                if title in dt_text:
                    log.info(f"[get_brand_product_detail_info] {title} 추출 성공!")
                    return dd_text
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 상세 정보 파싱 실패 ({title}): {e}")
    return ""


def _parse_review_poll(soup):
    """Collect ``{"gauge", "type", "value"}`` rows from the ``div.poll_all`` block.

    Returns an empty list when the block is absent. Missing sub-elements
    become empty strings.
    """
    rows = []
    poll_div = soup.select_one("div.poll_all")
    if not poll_div:
        return rows
    for dl in poll_div.select("dl.poll_type2.type3"):
        type_tag = dl.select_one("dt span")
        type_name = type_tag.text.strip() if type_tag else ""
        for li in dl.select("dd ul.list > li"):
            value_tag = li.select_one("span.txt")
            gauge_tag = li.select_one("em.per")
            rows.append({
                "gauge": gauge_tag.text.strip() if gauge_tag else "",
                "type": type_name,
                "value": value_tag.text.strip() if value_tag else "",
            })
    return rows


def get_brand_product_detail_info(sb, goods_no: str) -> dict:
    """Scrape one product detail page using an already-open SeleniumBase session.

    Every section is collected on a best-effort basis: a section that cannot
    be read is logged and left at its default value.

    Args:
        sb: SeleniumBase session; the function uses ``open``, ``click``,
            ``find_element``, ``find_elements`` and ``driver.page_source``.
        goods_no: Value placed in the ``goodsNo`` query parameter.

    Returns:
        ``{}`` when the page cannot be opened or parsed. Otherwise a dict with
        keys ``category``, ``totalComment``, ``numOfReviews``, ``avgReview``,
        ``pctOf5`` … ``pctOf1``, ``reviewDetail``, ``capacity``, ``detail``
        and ``ingredient``. ``avgReview`` is a float on success and ``""`` on
        failure; the ``pctOf*`` values are ``None`` unless exactly five
        percentages were found.
    """
    log = LoggingMixin().log
    url = f"https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={goods_no}"
    log.info(f"[get_brand_product_detail_info] 시작: goods_no={goods_no}")

    try:
        sb.open(url)
        log.info(f"[get_brand_product_detail_info] URL 오픈: {url}")
        time.sleep(1)
        soup = BeautifulSoup(sb.driver.page_source, 'html.parser')
    except Exception as e:
        log.error(f"[get_brand_product_detail_info] 페이지 오픈 실패: {e}")
        return {}

    # Category
    try:
        category = soup.select_one("a.cate_y#midCatNm").text.strip()
        log.info(f"[get_brand_product_detail_info] 카테고리 추출 성공: {category}")
    except Exception:
        category = ""
        log.warning("[get_brand_product_detail_info] 카테고리 추출 실패")

    # Total review count: strip "(", "건)" and thousands separators, then parse.
    try:
        review_info = soup.select_one("#repReview em")
        total_review = int(review_info.text.strip().replace("(", "").replace("건)", "").replace(",", ""))
        log.info(f"[get_brand_product_detail_info] 총 리뷰수: {total_review}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 총 리뷰수 파싱 실패: {e}")
        total_review = 0

    # Average review score (kept as "" on failure, matching the existing output format).
    try:
        review_score = float(soup.select_one("#repReview b").text.strip())
        log.info(f"[get_brand_product_detail_info] 리뷰평점: {review_score}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 리뷰평점 파싱 실패: {e}")
        review_score = ""

    # Defaults for the star-rating distribution and the headline comment.
    pctOf5 = pctOf4 = pctOf3 = pctOf2 = pctOf1 = None
    total_comment = ""

    # Only open the review tab when there is at least one review.
    if total_review > 0:
        try:
            sb.click("a.goods_reputation")
            log.info("[get_brand_product_detail_info] 리뷰탭 클릭 성공")
            # SeleniumBase takes (selector, by); the selector goes first.
            percent_elements = sb.find_elements("ul.graph_list span.per", by="css selector")
            percent_list = [el.text.strip() for el in percent_elements]
            if len(percent_list) == 5:
                # The first entry is stored as the 5-star share, the last as 1-star.
                pctOf5, pctOf4, pctOf3, pctOf2, pctOf1 = percent_list
            log.info(f"[get_brand_product_detail_info] 리뷰 분포: {percent_list}")

            try:
                comment_tag = sb.find_element("p.img_face em", by="css selector")
                total_comment = comment_tag.text.strip() if comment_tag else ""
                log.info(f"[get_brand_product_detail_info] 대표 코멘트 추출: {total_comment}")
            except Exception:
                total_comment = ""
                log.warning("[get_brand_product_detail_info] 대표 코멘트 추출 실패")
        except Exception as e:
            log.warning(f"[get_brand_product_detail_info] 리뷰 정보 수집 실패: {e}")
    else:
        log.warning("[get_brand_product_detail_info] 리뷰 정보 없음: 리뷰 수가 0건 입니다.")

    # Purchase-info tab: click it, wait for the content to load, then re-parse the page.
    try:
        sb.click("a.goods_buyinfo")
        time.sleep(1)  # wait for the ajax content to load
        soup = BeautifulSoup(sb.driver.page_source, 'html.parser')
        log.info("[get_brand_product_detail_info] 구매정보 탭 클릭 및 파싱 성공")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 구매정보 탭 클릭 실패: {e}")

    # Review poll breakdown.
    # NOTE(review): `soup` here is the snapshot taken after the purchase-info
    # click (or the initial snapshot if that click failed) — confirm the review
    # poll markup is still present in that DOM.
    review_detail = []
    try:
        review_detail = _parse_review_poll(soup)
        log.info(f"[get_brand_product_detail_info] reviewDetail 파싱 성공: {review_detail}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] reviewDetail 파싱 실패: {e}")

    # Detailed specs: page row title -> output key (capacity, main specs, ingredients).
    spec_map = {
        "용량": "capacity",
        "주요 사양": "detail",
        "성분": "ingredient",
    }
    detail_spec = {key: _extract_spec_value(soup, title, log) for title, key in spec_map.items()}

    return {
        "category": category,
        "totalComment": total_comment,
        "numOfReviews": total_review,
        "avgReview": review_score,
        "pctOf5": pctOf5,
        "pctOf4": pctOf4,
        "pctOf3": pctOf3,
        "pctOf2": pctOf2,
        "pctOf1": pctOf1,
        "reviewDetail": review_detail,
        **detail_spec,
    }
244+
245+
##### 실행 코드 #####
246+
# PB_BRAND_CODE_DICT = {
247+
# "바이오힐 보": "A000897",
248+
# "브링그린": "A002253",
249+
# "웨이크메이크": "A001240",
250+
# "컬러그램": "A002712",
251+
# "필리밀리": "A002502",
252+
# "아이디얼포맨": "A001643",
253+
# "라운드어라운드": "A001306",
254+
# "식물나라": "A000036",
255+
# "케어플러스": "A003339",
256+
# "탄탄": "A015673",
257+
# "딜라이트 프로젝트": "A003361",
258+
# }
259+
260+
# for brand_name, brand_code in PB_BRAND_CODE_DICT.items():
261+
# df = get_brand(brand_name, brand_code)
262+
263+
# with SB(uc=True, test=True) as sb:
264+
# detail_list = []
265+
# for goods_no in df['goodsNo']:
266+
# detail = get_brand_product_detail_info(sb, goods_no)
267+
# detail_list.append(detail)
268+
269+
# detail_df = pd.DataFrame(detail_list)
270+
# result_df = pd.concat([df.reset_index(drop=True), detail_df.reset_index(drop=True)], axis=1)
271+
272+
# result_df.to_json('skincare_result.json', orient='records', force_ascii=False, indent=2)

0 commit comments

Comments
 (0)