Skip to content

Commit bccc155

Browse files
authored
Merge pull request #54 from 1wos/v2-main
fix: 링크 키워드 추출 문제 해결
2 parents 0bddb90 + b89c2a0 commit bccc155

8 files changed

Lines changed: 202 additions & 30 deletions

File tree

api/routes/tools.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Utility and Tool API endpoints."""
22

33
import logging
4-
from typing import Optional
54

65
from fastapi import APIRouter, HTTPException, Query
76

casts/blog_writer/modules/agents.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from typing import Optional
1414

1515
from langchain.agents import create_agent
16-
from langchain_core.language_models import BaseChatModel
1716

1817
from .models import get_llm
1918
from .state import LLMProvider

casts/blog_writer/modules/nodes.py

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import json
1515
import re
16+
from typing import Any, Optional
1617

1718
import markdown
1819

@@ -33,6 +34,50 @@
3334
from casts.blog_writer.modules.tools import fetch_content, generate_image
3435

3536

37+
def _extract_json(text: str) -> Optional[Any]:
38+
"""Extract JSON from LLM response text.
39+
40+
Handles multiple formats:
41+
1. ```json ... ``` code blocks
42+
2. ``` ... ``` code blocks without language tag
43+
3. Raw JSON object {...}
44+
4. Raw JSON array [...]
45+
"""
46+
# Try code block with json tag
47+
json_match = re.search(r"```json\s*([\s\S]*?)```", text)
48+
if json_match:
49+
try:
50+
return json.loads(json_match.group(1).strip())
51+
except json.JSONDecodeError:
52+
pass
53+
54+
# Try code block without tag
55+
json_match = re.search(r"```\s*([\s\S]*?)```", text)
56+
if json_match:
57+
try:
58+
return json.loads(json_match.group(1).strip())
59+
except json.JSONDecodeError:
60+
pass
61+
62+
# Try to find raw JSON object
63+
json_match = re.search(r"\{[\s\S]*\}", text)
64+
if json_match:
65+
try:
66+
return json.loads(json_match.group(0))
67+
except json.JSONDecodeError:
68+
pass
69+
70+
# Try to find raw JSON array
71+
json_match = re.search(r"\[[\s\S]*\]", text)
72+
if json_match:
73+
try:
74+
return json.loads(json_match.group(0))
75+
except json.JSONDecodeError:
76+
pass
77+
78+
return None
79+
80+
3681
class FetchContent(AsyncBaseNode):
3782
"""URL에서 웹 콘텐츠 수집 (BS4/Playwright)."""
3883

@@ -79,15 +124,8 @@ async def execute(self, state, config=None):
79124
self.log("컨텐츠 분석 중...")
80125
response = await llm.ainvoke(prompt)
81126

82-
try:
83-
# Parse JSON from response
84-
content = response.content
85-
# Extract JSON from markdown code block if present
86-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
87-
if json_match:
88-
content = json_match.group(1)
89-
analyzed_content = json.loads(content)
90-
except json.JSONDecodeError:
127+
analyzed_content = _extract_json(response.content)
128+
if not analyzed_content:
91129
# Fallback structure
92130
analyzed_content = {
93131
"title": "Untitled",
@@ -134,14 +172,13 @@ async def execute(self, state, config=None):
134172
self.log("키워드 추천 중...")
135173
response = await llm.ainvoke(prompt)
136174

137-
try:
138-
content = response.content
139-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
140-
if json_match:
141-
content = json_match.group(1)
142-
data = json.loads(content)
175+
data = _extract_json(response.content)
176+
if data and isinstance(data, dict) and "keywords" in data:
143177
suggested_keywords = data.get("keywords", [])[:30]
144-
except json.JSONDecodeError:
178+
elif data and isinstance(data, list):
179+
# Handle case where LLM returns just an array
180+
suggested_keywords = data[:30]
181+
else:
145182
# Fallback: extract any quoted words
146183
suggested_keywords = re.findall(r'"([^"]+)"', response.content)[:30]
147184

@@ -226,13 +263,8 @@ async def execute(self, state, config=None):
226263
self.log("SEO 최적화 중...")
227264
response = await llm.ainvoke(prompt)
228265

229-
try:
230-
content = response.content
231-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
232-
if json_match:
233-
content = json_match.group(1)
234-
seo_meta = json.loads(content)
235-
except json.JSONDecodeError:
266+
seo_meta = _extract_json(response.content)
267+
if not seo_meta:
236268
seo_meta = {
237269
"title": "Blog Post",
238270
"description": blog_markdown[:160],

casts/blog_writer/modules/prompts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
- SEO에 효과적인 키워드
3939
- 검색량이 높을 것으로 예상되는 키워드
4040
- 콘텐츠 주제와 밀접하게 관련된 키워드
41+
- 반드시 한글 키워드로 제안 (영문 브랜드명/기술명은 그대로 사용 가능)
4142
4243
JSON 형식으로 키워드를 리스트로 반환하세요 (최대 30개):
4344
{{"keywords": ["키워드1", "키워드2", ...]}}"""

casts/blog_writer/modules/tools.py

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,12 @@ async def fetch_with_playwright(url: str) -> str:
6060

6161
await page.goto(url, wait_until="networkidle")
6262

63-
# Get main content
64-
content = await page.content()
63+
# Handle Naver blog iframe
64+
if "blog.naver.com" in url:
65+
content = await _extract_naver_blog_content(page)
66+
else:
67+
content = await page.content()
68+
6569
await browser.close()
6670

6771
soup = BeautifulSoup(content, "html.parser")
@@ -74,10 +78,36 @@ async def fetch_with_playwright(url: str) -> str:
7478
return "\n".join(lines)
7579

7680

81+
async def _extract_naver_blog_content(page) -> str:
    """Return the HTML of a Naver blog post.

    Naver blog pages render the actual post inside an ``iframe#mainFrame``;
    this extracts the iframe document, falling back to the top-level page
    HTML whenever the iframe or its content cannot be located in time.
    """
    try:
        # The post body lives in a child frame; wait up to 10s for it.
        container = await page.wait_for_selector(
            "iframe#mainFrame", timeout=10000
        )
        frame = await container.content_frame() if container else None
        if frame is not None:
            # Block until one of the known post-body containers renders.
            await frame.wait_for_selector(
                ".se-main-container, .post-view, #postViewArea",
                timeout=10000,
            )
            return await frame.content()
    except Exception:
        # Best effort: timeouts/detached frames fall through to the
        # outer-page fallback below.
        pass

    # Could not reach the iframe document — use the outer page HTML.
    return await page.content()
105+
106+
77107
async def fetch_content(
    url: str, scraper_type: ScraperType = ScraperType.BEAUTIFULSOUP
) -> str:
    """Fetch web content using configured scraper with fallback support.

    Args:
        url: URL to fetch
        scraper_type: Preferred scraper backend; defaults to BeautifulSoup.

    Returns:
        Extracted text content
    """
    if scraper_type == ScraperType.PLAYWRIGHT:
        # Caller explicitly asked for the browser-based scraper.
        return await fetch_with_playwright(url)

    # Cheap path first: static HTML via BeautifulSoup.
    best = await fetch_with_beautifulsoup(url)

    # A very short result, or a host known to render via JavaScript
    # (e.g. Naver blog), suggests the page needs a real browser; retry
    # with Playwright and keep whichever extraction yielded more text.
    if len(best) < 200 or _is_js_rendered_site(url):
        try:
            rendered = await fetch_with_playwright(url)
        except Exception:
            rendered = ""  # Playwright failed — stick with the BS4 result.
        if len(rendered) > len(best):
            best = rendered

    return best
137+
138+
139+
def _is_js_rendered_site(url: str) -> bool:
140+
"""Check if URL is known to require JavaScript rendering."""
141+
js_sites = [
142+
"blog.naver.com",
143+
"m.blog.naver.com",
144+
"post.naver.com",
145+
"brunch.co.kr",
146+
]
147+
return any(site in url for site in js_sites)
92148

93149

94150
# =============================================================================

casts/chat/modules/models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
- Embedding Models: https://docs.langchain.com/oss/python/integrations/text_embedding
1313
"""
1414

15-
from typing import Optional
1615

1716
from langchain.chat_models import init_chat_model
1817
from langchain_core.language_models import BaseChatModel

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies = [
1414
"langchain-openai>=1.1.6",
1515
"langgraph>=1.0.0",
1616
"markdown>=3.10",
17+
"playwright>=1.58.0",
1718
"pydantic-settings>=2.1.0",
1819
"python-dotenv>=1.0.1",
1920
"uvicorn[standard]>=0.34.0",

0 commit comments

Comments
 (0)