Skip to content

Commit bccc155

Browse files
authored
Merge pull request #54 from 1wos/v2-main
fix: 링크 키워드 추출 문제 해결
2 parents 0bddb90 + b89c2a0 commit bccc155

8 files changed

Lines changed: 202 additions & 30 deletions

File tree

api/routes/tools.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Utility and Tool API endpoints."""
22

33
import logging
4-
from typing import Optional
54

65
from fastapi import APIRouter, HTTPException, Query
76

casts/blog_writer/modules/agents.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from typing import Optional
1414

1515
from langchain.agents import create_agent
16-
from langchain_core.language_models import BaseChatModel
1716

1817
from .models import get_llm
1918
from .state import LLMProvider

casts/blog_writer/modules/nodes.py

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import json
1515
import re
16+
from typing import Any, Optional
1617

1718
import markdown
1819

@@ -33,6 +34,50 @@
3334
from casts.blog_writer.modules.tools import fetch_content, generate_image
3435

3536

37+
def _extract_json(text: str) -> Optional[Any]:
38+
"""Extract JSON from LLM response text.
39+
40+
Handles multiple formats:
41+
1. ```json ... ``` code blocks
42+
2. ``` ... ``` code blocks without language tag
43+
3. Raw JSON object {...}
44+
4. Raw JSON array [...]
45+
"""
46+
# Try code block with json tag
47+
json_match = re.search(r"```json\s*([\s\S]*?)```", text)
48+
if json_match:
49+
try:
50+
return json.loads(json_match.group(1).strip())
51+
except json.JSONDecodeError:
52+
pass
53+
54+
# Try code block without tag
55+
json_match = re.search(r"```\s*([\s\S]*?)```", text)
56+
if json_match:
57+
try:
58+
return json.loads(json_match.group(1).strip())
59+
except json.JSONDecodeError:
60+
pass
61+
62+
# Try to find raw JSON object
63+
json_match = re.search(r"\{[\s\S]*\}", text)
64+
if json_match:
65+
try:
66+
return json.loads(json_match.group(0))
67+
except json.JSONDecodeError:
68+
pass
69+
70+
# Try to find raw JSON array
71+
json_match = re.search(r"\[[\s\S]*\]", text)
72+
if json_match:
73+
try:
74+
return json.loads(json_match.group(0))
75+
except json.JSONDecodeError:
76+
pass
77+
78+
return None
79+
80+
3681
class FetchContent(AsyncBaseNode):
3782
"""URL에서 웹 콘텐츠 수집 (BS4/Playwright)."""
3883

@@ -79,15 +124,8 @@ async def execute(self, state, config=None):
79124
self.log("컨텐츠 분석 중...")
80125
response = await llm.ainvoke(prompt)
81126

82-
try:
83-
# Parse JSON from response
84-
content = response.content
85-
# Extract JSON from markdown code block if present
86-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
87-
if json_match:
88-
content = json_match.group(1)
89-
analyzed_content = json.loads(content)
90-
except json.JSONDecodeError:
127+
analyzed_content = _extract_json(response.content)
128+
if not analyzed_content:
91129
# Fallback structure
92130
analyzed_content = {
93131
"title": "Untitled",
@@ -134,14 +172,13 @@ async def execute(self, state, config=None):
134172
self.log("키워드 추천 중...")
135173
response = await llm.ainvoke(prompt)
136174

137-
try:
138-
content = response.content
139-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
140-
if json_match:
141-
content = json_match.group(1)
142-
data = json.loads(content)
175+
data = _extract_json(response.content)
176+
if data and isinstance(data, dict) and "keywords" in data:
143177
suggested_keywords = data.get("keywords", [])[:30]
144-
except json.JSONDecodeError:
178+
elif data and isinstance(data, list):
179+
# Handle case where LLM returns just an array
180+
suggested_keywords = data[:30]
181+
else:
145182
# Fallback: extract any quoted words
146183
suggested_keywords = re.findall(r'"([^"]+)"', response.content)[:30]
147184

@@ -226,13 +263,8 @@ async def execute(self, state, config=None):
226263
self.log("SEO 최적화 중...")
227264
response = await llm.ainvoke(prompt)
228265

229-
try:
230-
content = response.content
231-
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
232-
if json_match:
233-
content = json_match.group(1)
234-
seo_meta = json.loads(content)
235-
except json.JSONDecodeError:
266+
seo_meta = _extract_json(response.content)
267+
if not seo_meta:
236268
seo_meta = {
237269
"title": "Blog Post",
238270
"description": blog_markdown[:160],

casts/blog_writer/modules/prompts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
- SEO에 효과적인 키워드
3939
- 검색량이 높을 것으로 예상되는 키워드
4040
- 콘텐츠 주제와 밀접하게 관련된 키워드
41+
- 반드시 한글 키워드로 제안 (영문 브랜드명/기술명은 그대로 사용 가능)
4142
4243
JSON 형식으로 키워드를 리스트로 반환하세요 (최대 30개):
4344
{{"keywords": ["키워드1", "키워드2", ...]}}"""

casts/blog_writer/modules/tools.py

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,12 @@ async def fetch_with_playwright(url: str) -> str:
6060

6161
await page.goto(url, wait_until="networkidle")
6262

63-
# Get main content
64-
content = await page.content()
63+
# Handle Naver blog iframe
64+
if "blog.naver.com" in url:
65+
content = await _extract_naver_blog_content(page)
66+
else:
67+
content = await page.content()
68+
6569
await browser.close()
6670

6771
soup = BeautifulSoup(content, "html.parser")
@@ -74,10 +78,36 @@ async def fetch_with_playwright(url: str) -> str:
7478
return "\n".join(lines)
7579

7680

81+
async def _extract_naver_blog_content(page) -> str:
    """Return the HTML of a Naver blog post.

    Naver blog pages render the actual post inside an ``iframe#mainFrame``;
    this extracts the iframe document, falling back to the top-level page
    HTML whenever the iframe or its content cannot be located in time.
    """
    try:
        # The post body lives in a child frame; wait up to 10s for it.
        container = await page.wait_for_selector(
            "iframe#mainFrame", timeout=10000
        )
        frame = await container.content_frame() if container else None
        if frame is not None:
            # Block until one of the known post-body containers renders.
            await frame.wait_for_selector(
                ".se-main-container, .post-view, #postViewArea",
                timeout=10000,
            )
            return await frame.content()
    except Exception:
        # Best effort: timeouts/detached frames fall through to the
        # outer-page fallback below.
        pass

    # Could not reach the iframe document — use the outer page HTML.
    return await page.content()
105+
106+
77107
async def fetch_content(
    url: str, scraper_type: ScraperType = ScraperType.BEAUTIFULSOUP
) -> str:
    """Fetch web content using configured scraper with fallback support.

    Args:
        url: URL to fetch
        scraper_type: Preferred scraper backend; defaults to BeautifulSoup.

    Returns:
        Extracted text content
    """
    if scraper_type == ScraperType.PLAYWRIGHT:
        # Caller explicitly asked for the browser-based scraper.
        return await fetch_with_playwright(url)

    # Cheap path first: static HTML via BeautifulSoup.
    best = await fetch_with_beautifulsoup(url)

    # A very short result, or a host known to render via JavaScript
    # (e.g. Naver blog), suggests the page needs a real browser; retry
    # with Playwright and keep whichever extraction yielded more text.
    if len(best) < 200 or _is_js_rendered_site(url):
        try:
            rendered = await fetch_with_playwright(url)
        except Exception:
            rendered = ""  # Playwright failed — stick with the BS4 result.
        if len(rendered) > len(best):
            best = rendered

    return best
137+
138+
139+
def _is_js_rendered_site(url: str) -> bool:
140+
"""Check if URL is known to require JavaScript rendering."""
141+
js_sites = [
142+
"blog.naver.com",
143+
"m.blog.naver.com",
144+
"post.naver.com",
145+
"brunch.co.kr",
146+
]
147+
return any(site in url for site in js_sites)
92148

93149

94150
# =============================================================================

casts/chat/modules/models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
- Embedding Models: https://docs.langchain.com/oss/python/integrations/text_embedding
1313
"""
1414

15-
from typing import Optional
1615

1716
from langchain.chat_models import init_chat_model
1817
from langchain_core.language_models import BaseChatModel

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies = [
1414
"langchain-openai>=1.1.6",
1515
"langgraph>=1.0.0",
1616
"markdown>=3.10",
17+
"playwright>=1.58.0",
1718
"pydantic-settings>=2.1.0",
1819
"python-dotenv>=1.0.1",
1920
"uvicorn[standard]>=0.34.0",

0 commit comments

Comments
 (0)