Refactor code structure for improved readability and maintainability

legout · legout · commit 73a8b4e2173f · 2025-08-06T19:27:56.000+02:00
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
@@ -0,0 +1,60 @@
+name: Publish Python 🐍 distribution to PyPI on version bump
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'pyproject.toml'
+
+jobs:
+  publish-to-pypi:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        run: |
+          pip install uv
+
+      - name: Install build dependencies with uv  # --system: the runner has no virtualenv for uv to target
+        run: |
+          uv pip install --system build toml requests packaging
+
+      - name: Get current version from pyproject.toml
+        id: get_version
+        run: |
+          python -c "import toml; print(toml.load('pyproject.toml')['project']['version'])" > version.txt
+          echo "version=$(cat version.txt)" >> "$GITHUB_OUTPUT"
+
+      - name: Get latest version from PyPI
+        id: get_latest  # falls back to 0.0.0 when PyPI has no release yet (empty body or JSON without 'info')
+        run: |
+          PKG_NAME=$(python -c "import toml; print(toml.load('pyproject.toml')['project']['name'])")
+          curl -s "https://pypi.org/pypi/$PKG_NAME/json" | \
+            python -c "import sys, json; raw = sys.stdin.read(); info = (json.loads(raw) if raw.strip() else {}).get('info', {}); print(info.get('version', '0.0.0'))" > latest.txt
+          echo "latest=$(cat latest.txt)" >> "$GITHUB_OUTPUT"
+
+      - name: Compare versions and set publish flag
+        id: check_version  # publish is 'True' only when pyproject.toml is ahead of the PyPI release
+        run: |
+          VERSION="${{ steps.get_version.outputs.version }}" LATEST="${{ steps.get_latest.outputs.latest }}" \
+            python -c "from packaging.version import parse; import os; print(parse(os.environ['VERSION']) > parse(os.environ['LATEST']))" > publish.txt
+          echo "publish=$(cat publish.txt | tail -n1)" >> "$GITHUB_OUTPUT"
+
+      - name: Build package
+        if: steps.check_version.outputs.publish == 'True'
+        run: python -m build  # 'build' was installed into the runner's Python by the dependency step
+
+      - name: Publish to PyPI
+        if: steps.check_version.outputs.publish == 'True'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: ${{ secrets.PYPI_USERNAME }}
+          password: ${{ secrets.PYPI_PASSWORD }}
+          # Alternatively, use 'password: ${{ secrets.PYPI_API_TOKEN }}' if using an API token
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,7 @@ analysis_output/
 dev/
 analysis_ba_daniel_grundlagen2/
 analysis_ba_daniel_grundlagen*/**
-output/
+output/
+ENHANCEMENTS.md
+CLAUDE.md
+ProjectPlan.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,6 +41,7 @@ build-backend = "hatchling.build"
 
 [dependency-groups]
 dev = [
+    "ipykernel>=6.30.1",
     "marimo>=0.14.16",
     "pdf2text>=1.0.0",
 ]
diff --git a/src/veritascribe/config.py b/src/veritascribe/config.py
@@ -136,21 +136,29 @@ def format_model_name(self, model_name: Optional[str] = None) -> str:
         return model
     
     def get_provider_specific_max_tokens(self) -> int:
-        """Get provider-specific max token limits optimized for each provider."""
-        provider_token_limits = {
-            "openai": min(self.max_tokens, 4000),  # OpenAI models generally handle larger contexts well
-            "openrouter": min(self.max_tokens, 3000),  # More conservative for free/cheaper models
-            "anthropic": min(self.max_tokens, 4000),  # Claude handles large contexts well
-            "custom": min(self.max_tokens, 4000)  # Conservative for unknown endpoints
+        """Get provider-specific max token limits, respecting user configuration."""
+        # Use user's max_tokens as the primary value
+        user_max_tokens = self.max_tokens
+        
+        # Define reasonable upper bounds per provider (only used as safety caps)
+        provider_max_caps = {
+            "openai": 8000,  # OpenAI models generally handle larger contexts well
+            "openrouter": 8000,  # Same cap as OpenAI
+            "anthropic": 4000,  # Lowest cap of the listed providers
+            "custom": 8000  # Same cap as the fallback used for unlisted providers
+        }
         
+        # Apply provider-specific cap only if user's setting exceeds it
+        provider_cap = provider_max_caps.get(self.llm_provider, 8000)
+        base_tokens = min(user_max_tokens, provider_cap)
+        
         # For specific known problematic models, use even lower limits
         formatted_model = self.format_model_name()
         if "free" in formatted_model.lower() or "air" in formatted_model.lower():
             # Free models often have quality issues with large contexts
-            return min(provider_token_limits.get(self.llm_provider, self.max_tokens), 2000)
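+            return min(base_tokens, 8000)  # NOTE(review): no-op, base_tokens is already capped at <= 8000 above; confirm the intended limit for free/air models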
         
-        return provider_token_limits.get(self.llm_provider, self.max_tokens)
+        return base_tokens
 
 
 # Provider-specific model configurations
diff --git a/src/veritascribe/pdf_processor.py b/src/veritascribe/pdf_processor.py
@@ -159,29 +159,27 @@ def _fix_german_umlauts(self, text: str) -> str:
         
        # Replace '"a' with 'ä' and '"A' with 'Ä'
         text = re.sub(r'"a', 'ä', text)
+        text = re.sub(r'¨a', 'ä', text)
+        text = re.sub(r'“a', 'ä', text)
         text = re.sub(r'"A', 'Ä', text)
+        text = re.sub(r'¨A', 'Ä', text)
+        text = re.sub(r'“A', 'Ä', text)
 
         # Replace '"o' with 'ö' and '"O' with 'Ö'
         text = re.sub(r'"o', 'ö', text)
+        text = re.sub(r'¨o', 'ö', text)
+        text = re.sub(r'“o', 'ö', text)
         text = re.sub(r'"O', 'Ö', text)
+        text = re.sub(r'¨O', 'Ö', text)
+        text = re.sub(r'“O', 'Ö', text)
 
         # Replace '"u' with 'ü' and '"U' with 'Ü'
         text = re.sub(r'"u', 'ü', text)
-        text = re.sub(r'"U', 'Ü', text)
-
-        # Replace '¨a' with 'ä' and '¨A' with 'Ä'
-        text = re.sub(r'¨a', 'ä', text)
-        text = re.sub(r'¨A', 'Ä', text)
-
-        # Replace '¨o' with 'ö' and '¨O' with 'Ö'
-        text = re.sub(r'¨o', 'ö', text)
-        text = re.sub(r'¨O', 'Ö', text)
-
-        # Replace '¨u' with 'ü' and '¨U' with 'Ü'
         text = re.sub(r'¨u', 'ü', text)
+        text = re.sub(r'“u', 'ü', text)
+        text = re.sub(r'"U', 'Ü', text)
         text = re.sub(r'¨U', 'Ü', text)
-
-
+        text = re.sub(r'“U', 'Ü', text)
 
         return text
     
@@ -235,7 +233,7 @@ def _clean_extracted_text(self, text: str) -> str:
         
         # Step 6: Fix common spacing issues
         text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # Remove space before punctuation
-        
+
         # Protect URLs and email addresses to avoid inserting spaces within them
         protected = []
         def _mask(match):
@@ -388,6 +386,7 @@ def extract_bibliography_section(self, pdf_path: str) -> Optional[str]:
                 for page_num in range(len(doc)):
                     page = doc[page_num]
                     page_text = page.get_text()
+                    page_text = self._clean_extracted_text(page_text)
                     
                     # Look for bibliography section headers
                     bib_headers = [
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,7 @@ build-backend = "hatchling.build"`
`41`	`41`
`42`	`42`	`[dependency-groups]`
`43`	`43`	`dev = [`
	`44`	`+ "ipykernel>=6.30.1",`
`44`	`45`	`"marimo>=0.14.16",`
`45`	`46`	`"pdf2text>=1.0.0",`
`46`	`47`	`]`