1+ from seleniumbase import SB
2+ from bs4 import BeautifulSoup
3+ import datetime
4+ import time
5+ import os
6+ from airflow .utils .log .logging_mixin import LoggingMixin
7+
# Chromium command-line flags exported through the SB_OPTIONS environment
# variable at import time (presumably picked up by the SeleniumBase launcher
# when running in a container -- TODO confirm where SB_OPTIONS is read).
os.environ["SB_OPTIONS"] = " ".join(
    (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    )
)
9+
def _text_of(tag, default=""):
    """Return the stripped text of a parsed tag, or ``default`` when it is None."""
    if tag is None:
        return default
    return tag.text.strip()


def _parse_brand_item(item, brand, collected_at):
    """Turn one product-list ``<li>`` element into ``(record, goods_no)``.

    Each field falls back to an empty value when its element is missing, so
    a partially rendered item still produces a record. ``goods_no`` is ""
    when the item has no ``a[data-ref-goodsno]`` link.
    """
    link = item.select_one("a[data-ref-goodsno]")
    goods_no = link["data-ref-goodsno"] if link is not None else ""

    # Prices are kept as digit strings: the currency suffix, thousands
    # separators and (sale price only) the "~" range marker are removed.
    sale_price = (
        _text_of(item.select_one("strong.total"))
        .replace("원", "")
        .replace(",", "")
        .replace("~", "")
    )
    original_price = (
        _text_of(item.select_one("span.origin"))
        .replace("원", "")
        .replace(",", "")
    )

    # Non-empty flag labels joined with commas; "" when there are none.
    flag_texts = (span.text.strip() for span in item.select("div.flags span.flag"))
    flag_str = ",".join(text for text in flag_texts if text)

    record = {
        "brandName": brand,
        # Every product collected by get_brand is recorded with isPB = 1.
        "isPB": 1,
        "goodsName": _text_of(item.select_one("span.prod-name.double-line")),
        "salePrice": sale_price,
        "originalPrice": original_price,
        "flagList": flag_str,
        "isSoldout": item.select_one("span.status_flag.soldout") is not None,
        "createdAt": collected_at,
    }
    return record, goods_no


def get_brand(brand_name, brand_code):
    """Crawl all listing pages of one Olive Young brand shop.

    Opens the brand shop page in a headless SeleniumBase session, parses the
    product list of the current page, then clicks the numbered pager links
    (2, 3, ...) until a link cannot be clicked or a page has no products.

    Args:
        brand_name: Brand label used in logs and as the fallback brand name
            when the page has no ``h2.title-detail-brand`` heading.
        brand_code: Value for the ``onlBrndCd`` query parameter.

    Returns:
        A ``(data, goods_no_list)`` tuple. ``data`` is a list of product
        dicts with the keys ``brandName``, ``isPB``, ``goodsName``,
        ``salePrice``, ``originalPrice``, ``flagList``, ``isSoldout`` and
        ``createdAt``. ``goods_no_list`` holds the non-empty goods numbers in
        crawl order; items without a goods number appear in ``data`` only,
        so the two lists can differ in length.
    """
    log = LoggingMixin().log
    log.info(f"[get_brand] 시작: {brand_name} ({brand_code})")
    url = f"https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd={brand_code}"
    data = []
    goods_no_list = []
    # One timestamp for the whole crawl so all records share "createdAt".
    collected_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    with SB(uc=True, test=True, headless=True) as sb:
        log.info(f"[get_brand] URL 오픈: {url}")
        sb.open(url)
        time.sleep(1)

        page = 1
        while True:
            log.info(f"[get_brand] {page}페이지 크롤링 시작")
            if page > 1:
                # A failed click is treated as "no more pages", not an error.
                # NOTE(review): only numbered pager links are followed; if the
                # pager hides later pages behind a "next" control they would
                # not be reached -- confirm against the live page.
                try:
                    sb.click(f"div.pageing a[data-page-no='{page}']")
                    time.sleep(2)
                except Exception as e:
                    log.warning(f"{page}페이지 버튼 클릭 실패 또는 더 이상 페이지 없음: {e}")
                    break

            soup = BeautifulSoup(sb.driver.page_source, "html.parser")

            # Prefer the brand name shown on the page; fall back to the argument.
            brand = _text_of(soup.select_one("h2.title-detail-brand"), default=brand_name)

            items = soup.select("ul.prod-list.goodsProd > li")
            log.info(f"[get_brand] {page}페이지 상품 개수: {len(items)}")
            if not items:
                log.info(f"{page}페이지에 상품이 없습니다. 종료")
                break

            for item in items:
                record, goods_no = _parse_brand_item(item, brand, collected_at)
                if goods_no:
                    goods_no_list.append(goods_no)
                data.append(record)

            log.info(f"[get_brand] {page}페이지 누적 상품 수: {len(data)}")
            page += 1

    log.info(f"[get_brand] 크롤링 종료: 총 상품 {len(data)}개, goods_no {len(goods_no_list)}개")
    return data, goods_no_list
98+
99+
def get_brand_product_detail_info(sb, goods_no: str) -> dict:
    """Collect category, review statistics and purchase-info specs for one product.

    The product detail page is opened in ``sb``. The review tab is clicked
    only when the page reports at least one review; the purchase-info tab is
    clicked afterwards so its spec table can be parsed. Every step after the
    initial page load is best-effort: a failure is logged and the affected
    fields keep their fallback values.

    Args:
        sb: Browser session exposing ``open``, ``click``, ``find_element``,
            ``find_elements`` and ``driver.page_source`` (presumably the
            object yielded by ``with SB(...)`` -- confirm against callers).
        goods_no: Value for the ``goodsNo`` query parameter of the detail URL.

    Returns:
        ``{}`` when the page cannot be opened or parsed. Otherwise a dict
        with the keys ``category``, ``totalComment``, ``numOfReviews``,
        ``avgReview``, ``pctOf5`` .. ``pctOf1``, ``reviewDetail``,
        ``capacity``, ``detail`` and ``ingredient``.
    """
    log = LoggingMixin().log
    url = f"https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={goods_no}"
    log.info(f"[get_brand_product_detail_info] 시작: goods_no={goods_no}")

    try:
        sb.open(url)
        log.info(f"[get_brand_product_detail_info] URL 오픈: {url}")
        time.sleep(1)
        soup = BeautifulSoup(sb.driver.page_source, "html.parser")
    except Exception as e:
        log.error(f"[get_brand_product_detail_info] 페이지 오픈 실패: {e}")
        return {}

    # Category
    try:
        category = soup.select_one("a.cate_y#midCatNm").text.strip()
        log.info(f"[get_brand_product_detail_info] 카테고리 추출 성공: {category}")
    except Exception:
        category = ""
        log.warning("[get_brand_product_detail_info] 카테고리 추출 실패")

    # Total number of reviews: the surrounding "(" / "건)" and thousands
    # separators are removed before the int conversion.
    try:
        review_info = soup.select_one("#repReview em")
        total_review = int(
            review_info.text.strip().replace("(", "").replace("건)", "").replace(",", "")
        )
        log.info(f"[get_brand_product_detail_info] 총 리뷰수: {total_review}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 총 리뷰수 파싱 실패: {e}")
        total_review = 0

    # Average review score.
    # NOTE(review): the fallback is "" while a parsed score is a float; kept
    # as-is so existing consumers of "avgReview" see the same values.
    try:
        review_score = float(soup.select_one("#repReview b").text.strip())
        log.info(f"[get_brand_product_detail_info] 리뷰평점: {review_score}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 리뷰평점 파싱 실패: {e}")
        review_score = ""

    # Defaults for the star distribution and the headline comment.
    pctOf5 = pctOf4 = pctOf3 = pctOf2 = pctOf1 = None
    total_comment = ""

    # The review tab is only opened when there is at least one review.
    if total_review > 0:
        try:
            sb.click("a.goods_reputation")
            log.info("[get_brand_product_detail_info] 리뷰탭 클릭 성공")
            # NOTE(review): arguments are passed WebDriver-style (by, value);
            # SeleniumBase documents (selector, by) -- confirm this session
            # object accepts the order used here.
            percent_elements = sb.find_elements("css selector", "ul.graph_list span.per")
            percent_list = [el.text.strip() for el in percent_elements]
            # The distribution is only used when exactly five values are
            # present, in the order 5 stars down to 1 star.
            if len(percent_list) == 5:
                pctOf5, pctOf4, pctOf3, pctOf2, pctOf1 = percent_list
            log.info(f"[get_brand_product_detail_info] 리뷰 분포: {percent_list}")

            try:
                comment_tag = sb.find_element("css selector", "p.img_face em")
                total_comment = comment_tag.text.strip() if comment_tag else ""
                log.info(f"[get_brand_product_detail_info] 대표 코멘트 추출: {total_comment}")
            except Exception:
                total_comment = ""
                log.warning("[get_brand_product_detail_info] 대표 코멘트 추출 실패")
        except Exception as e:
            log.warning(f"[get_brand_product_detail_info] 리뷰 정보 수집 실패: {e}")
    else:
        # Fixed log tag: this message previously used "[get_product_detail_info]".
        log.warning("[get_brand_product_detail_info] 리뷰 정보 없음: 리뷰 수가 0건 입니다.")

    # Purchase-info tab: click it and re-parse the page so the spec table is
    # available. On failure the previously parsed soup keeps being used.
    try:
        sb.click("a.goods_buyinfo")
        time.sleep(1)  # wait for the tab content to load
        soup = BeautifulSoup(sb.driver.page_source, "html.parser")
        log.info("[get_brand_product_detail_info] 구매정보 탭 클릭 및 파싱 성공")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] 구매정보 탭 클릭 실패: {e}")

    def get_detail_info(soup, title):
        """Return the <dd> text of the first spec row whose <dt> contains ``title``, else ""."""
        try:
            for dl in soup.select("div#artcInfo dl.detail_info_list"):
                dt = dl.select_one("dt")
                dd = dl.select_one("dd")
                if not (dt and dd):
                    continue
                dt_text = dt.text.strip()
                dd_text = dd.text.strip()
                if title in dt_text:
                    log.info(f"[get_brand_product_detail_info] {title} 추출 성공!")
                    return dd_text
        except Exception as e:
            log.warning(f"[get_brand_product_detail_info] 상세 정보 파싱 실패 ({title}): {e}")
        return ""

    # Review poll breakdown, parsed from the most recent snapshot: the one
    # taken after the purchase-info click when that succeeded, otherwise the
    # initial page load.
    review_detail = []
    try:
        poll_div = soup.select_one("div.poll_all")
        if poll_div:
            for dl in poll_div.select("dl.poll_type2.type3"):
                type_tag = dl.select_one("dt span")
                type_name = type_tag.text.strip() if type_tag else ""
                for li in dl.select("dd ul.list > li"):
                    value_tag = li.select_one("span.txt")
                    gauge_tag = li.select_one("em.per")
                    review_detail.append({
                        "gauge": gauge_tag.text.strip() if gauge_tag else "",
                        "type": type_name,
                        "value": value_tag.text.strip() if value_tag else "",
                    })
        log.info(f"[get_brand_product_detail_info] reviewDetail 파싱 성공: {review_detail}")
    except Exception as e:
        log.warning(f"[get_brand_product_detail_info] reviewDetail 파싱 실패: {e}")

    # Spec rows to extract: page label -> output key.
    spec_map = {
        "용량": "capacity",
        "주요 사양": "detail",
        "성분": "ingredient",
    }
    detail_spec = {key: get_detail_info(soup, title) for title, key in spec_map.items()}

    return {
        "category": category,
        "totalComment": total_comment,
        "numOfReviews": total_review,
        "avgReview": review_score,
        "pctOf5": pctOf5,
        "pctOf4": pctOf4,
        "pctOf3": pctOf3,
        "pctOf2": pctOf2,
        "pctOf1": pctOf1,
        "reviewDetail": review_detail,
        **detail_spec,
    }
244+
##### Example usage (kept commented out) #####
# import pandas as pd
#
# PB_BRAND_CODE_DICT = {
#     "바이오힐 보": "A000897",
#     "브링그린": "A002253",
#     "웨이크메이크": "A001240",
#     "컬러그램": "A002712",
#     "필리밀리": "A002502",
#     "아이디얼포맨": "A001643",
#     "라운드어라운드": "A001306",
#     "식물나라": "A000036",
#     "케어플러스": "A003339",
#     "탄탄": "A015673",
#     "딜라이트 프로젝트": "A003361",
# }

# for brand_name, brand_code in PB_BRAND_CODE_DICT.items():
#     # get_brand returns (product dicts, goods numbers), not a DataFrame.
#     data, goods_no_list = get_brand(brand_name, brand_code)
#     df = pd.DataFrame(data)

#     with SB(uc=True, test=True) as sb:
#         detail_list = []
#         for goods_no in goods_no_list:
#             detail = get_brand_product_detail_info(sb, goods_no)
#             detail_list.append(detail)

#     detail_df = pd.DataFrame(detail_list)
#     # Rows only line up if every listed product exposed a goods number.
#     result_df = pd.concat([df.reset_index(drop=True), detail_df.reset_index(drop=True)], axis=1)

#     result_df.to_json('skincare_result.json', orient='records', force_ascii=False, indent=2)
0 commit comments