Skip to content

Commit f1fb496

Browse files
committed
fix: Security improvements and file splitting for code quality
Security fixes:
- Remove .env from git tracking (was exposing secrets)
- Fix SQL injection in admin.py document search with escape_like()
- Add rate limiting to sensitive admin endpoints (update_user, system_cleanup, merge_tags)

File splitting for 800-line limit compliance:
- Split ai_service.py (1058→460 lines) into 3 modules:
  - ai_service.py: Main service class
  - ai_tag_suggestion.py: Tag/title suggestion logic
  - ai_config_service.py: Configuration handling
- Split ml_analytics_service.py (821→310 lines) into 3 modules:
  - ml_analytics_service.py: Main service class
  - ml_document_analysis.py: Document content analysis
  - ml_corpus_analysis.py: Clustering and trend analysis
1 parent bfd7886 commit f1fb496

8 files changed

Lines changed: 1260 additions & 1382 deletions

File tree

.env

Lines changed: 0 additions & 57 deletions
This file was deleted.

app/routes/admin.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from flask import Blueprint, jsonify, request
77
from flask_jwt_extended import jwt_required
88
from sqlalchemy import func, desc, and_
9+
from app import limiter
910
from datetime import datetime, timedelta, timezone
1011
from app import db
1112
from app.models.user import User
@@ -151,6 +152,7 @@ def get_user_details(user_id):
151152

152153
@admin_bp.route('/admin/users/<int:user_id>', methods=['PUT'])
153154
@jwt_required()
155+
@limiter.limit("10 per minute")
154156
def update_user(user_id):
155157
"""Update user information"""
156158
try:
@@ -197,9 +199,10 @@ def list_all_documents():
197199
query = Document.query
198200

199201
if search:
202+
search_escaped = escape_like(search)
200203
query = query.filter(
201-
Document.title.ilike(f'%{search}%') |
202-
Document.markdown_content.ilike(f'%{search}%')
204+
Document.title.ilike(f'%{search_escaped}%') |
205+
Document.markdown_content.ilike(f'%{search_escaped}%')
203206
)
204207

205208
def serialize_doc_with_owner(doc):
@@ -286,6 +289,7 @@ def get_system_stats():
286289

287290
@admin_bp.route('/admin/system/cleanup', methods=['POST'])
288291
@jwt_required()
292+
@limiter.limit("5 per hour")
289293
def system_cleanup():
290294
"""Perform system cleanup operations"""
291295
try:
@@ -331,6 +335,7 @@ def system_cleanup():
331335

332336
@admin_bp.route('/admin/tags/merge', methods=['POST'])
333337
@jwt_required()
338+
@limiter.limit("20 per minute")
334339
def merge_tags():
335340
"""Merge duplicate tags"""
336341
try:

app/services/ai_config_service.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
"""
2+
AI Configuration Service
3+
Handles AI configuration loading, saving, and connection testing
4+
"""
5+
6+
import os
7+
import logging
8+
from typing import Dict, Any
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def get_env_keys() -> Dict[str, str]:
    """Collect provider credentials from environment variables.

    Returns a dict keyed by provider name. API key entries default to an
    empty string when the variable is unset; the ``local`` entry holds the
    local LLM server URL and defaults to ``http://localhost:8080``.
    """
    # (provider, environment variable, fallback) in the order callers see.
    env_lookup = (
        ('openai', 'OPENAI_API_KEY', ''),
        ('anthropic', 'ANTHROPIC_API_KEY', ''),
        ('google', 'GOOGLE_API_KEY', ''),
        ('local', 'LOCAL_LLM_URL', 'http://localhost:8080'),
    )
    return {
        provider: os.getenv(variable, fallback)
        for provider, variable, fallback in env_lookup
    }
21+
22+
23+
def get_default_config(env_keys: Dict[str, str]) -> Dict[str, Any]:
    """Build the baseline AI configuration from environment variables.

    Args:
        env_keys: Mapping of provider name to its API key (or URL for the
            ``local`` provider); the entry for the default provider becomes
            ``llmApiKey``.

    Returns:
        A dict with the keys ``ocrService``, ``ocrApiKey``, ``llmProvider``,
        ``llmApiKey``, ``llmModel``, ``enableAiTags`` and ``enableAiSummary``.
    """
    def _flag(variable: str, fallback: str) -> bool:
        # Only the literal string "true" (any casing) switches a flag on.
        return os.getenv(variable, fallback).lower() == 'true'

    provider = os.getenv('DEFAULT_LLM_PROVIDER', 'openai')

    # The local provider has its own model variable and default.
    model = (
        os.getenv('LOCAL_LLM_MODEL', 'llama2')
        if provider == 'local'
        else os.getenv('DEFAULT_LLM_MODEL', 'gpt-3.5-turbo')
    )

    return {
        'ocrService': os.getenv('DEFAULT_OCR_SERVICE', 'tesseract'),
        'ocrApiKey': '',
        'llmProvider': provider,
        'llmApiKey': env_keys.get(provider, ''),
        'llmModel': model,
        'enableAiTags': _flag('DEFAULT_AI_TAGS_ENABLED', 'true'),
        'enableAiSummary': _flag('DEFAULT_AI_SUMMARY_ENABLED', 'false'),
    }
46+
47+
48+
def mask_api_key(api_key: str) -> str:
    """Return a display-safe version of an API key.

    Keys longer than eight characters keep their first and last four
    characters with every character in between replaced by ``*``; shorter
    keys are replaced entirely by ``*``. An empty key yields ``''``.
    """
    if not api_key:
        return ''

    key_length = len(api_key)
    if key_length <= 8:
        # Too short to reveal any part of it.
        return '*' * key_length

    hidden_middle = '*' * (key_length - 8)
    return api_key[:4] + hidden_middle + api_key[-4:]
55+
56+
57+
def get_safe_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of ``config`` with its API keys masked.

    The ``llmApiKey`` and ``ocrApiKey`` entries are passed through
    ``mask_api_key`` when present and non-empty; every other entry is
    copied unchanged. The input mapping itself is not modified.
    """
    safe_config: Dict[str, Any] = dict(config)

    for secret_field in ('llmApiKey', 'ocrApiKey'):
        secret_value = safe_config.get(secret_field)
        if secret_value:
            safe_config[secret_field] = mask_api_key(secret_value)

    return safe_config
68+
69+
70+
def _llm_failure(message: str) -> Dict:
    """Build the failure payload returned by the LLM checks."""
    return {'success': False, 'error': message}


def _llm_success(message: str) -> Dict:
    """Build the success payload returned by the LLM checks."""
    return {'success': True, 'message': message}


def _check_openai_key(api_key: str) -> Dict:
    """Validate the shape of an OpenAI API key."""
    # Project-scoped keys ("sk-proj-...") also start with "sk-", so a single
    # prefix check covers both.
    if not api_key.startswith('sk-'):
        return _llm_failure(
            'Invalid OpenAI API key format. Keys should start with "sk-"'
        )
    if len(api_key) < 20:
        return _llm_failure('OpenAI API key appears to be too short')
    return _llm_success('OpenAI API key format is valid')


def _check_anthropic_key(api_key: str) -> Dict:
    """Validate the shape of an Anthropic API key."""
    if not api_key.startswith('sk-ant-'):
        return _llm_failure(
            'Invalid Anthropic API key format. Keys should start with "sk-ant-"'
        )
    if len(api_key) < 30:
        return _llm_failure('Anthropic API key appears to be too short')
    return _llm_success('Anthropic (Claude) API key format is valid')


def _check_google_key(api_key: str) -> Dict:
    """Validate the shape of a Google API key."""
    if len(api_key) < 10:
        return _llm_failure('Google API key appears to be too short')
    # Only letters, digits, "-" and "_" are accepted.
    if not api_key.replace('-', '').replace('_', '').isalnum():
        return _llm_failure('Google API key format appears invalid')
    return _llm_success('Google (Gemini) API key format is valid')


def _check_local_url(server_url: str) -> Dict:
    """Validate the URL of a local LLM server."""
    from urllib.parse import urlparse

    if not server_url:
        return _llm_failure('Local LLM server URL is required')
    if not server_url.startswith(('http://', 'https://')):
        return _llm_failure(
            'Local LLM server URL must start with http:// or https://'
        )
    try:
        has_host = bool(urlparse(server_url).netloc)
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
        has_host = False
    if not has_host:
        return _llm_failure('Invalid Local LLM server URL format')
    return _llm_success(f'Local LLM server URL format is valid: {server_url}')


# Provider name -> validator for that provider's credential.
_LLM_KEY_CHECKS = {
    'openai': _check_openai_key,
    'anthropic': _check_anthropic_key,
    'google': _check_google_key,
    'local': _check_local_url,
}


def test_llm_connection(config_data: Dict) -> Dict:
    """Validate the LLM credential in ``config_data`` for its provider.

    Despite the name, no network request is made: the function only checks
    that the API key (or, for the ``local`` provider, the server URL) has a
    plausible format.

    Args:
        config_data: Configuration dict. ``llmProvider`` defaults to
            ``'openai'`` and ``llmApiKey`` to ``''``. For the ``local``
            provider an empty key falls back to the ``LOCAL_LLM_URL``
            environment variable, then to ``http://localhost:8080``.

    Returns:
        ``{'success': True, 'message': ...}`` when the credential looks
        valid, otherwise ``{'success': False, 'error': ...}``. Unexpected
        exceptions are reported as a failure payload rather than raised.
    """
    try:
        provider = config_data.get('llmProvider', 'openai')
        api_key = config_data.get('llmApiKey', '')

        if provider == 'local':
            if not api_key:
                api_key = os.getenv('LOCAL_LLM_URL', 'http://localhost:8080')
        elif not api_key:
            return _llm_failure('API key is required')

        # Guard the lookup so an unhashable provider value falls through to
        # the generic branch instead of raising.
        check = _LLM_KEY_CHECKS.get(provider) if isinstance(provider, str) else None
        if check is not None:
            return check(api_key)

        # Unknown provider: only a minimal length check is possible.
        if len(api_key) < 10:
            return _llm_failure('API key appears to be invalid')
        return _llm_success(f'{provider} API key provided (format not validated)')

    except Exception as e:
        return _llm_failure(f'LLM test failed: {str(e)}')


# The "test_" prefix would make pytest collect this function as a test when it
# is imported into a test module; opt out explicitly.
test_llm_connection.__test__ = False
187+
188+
189+
def test_ocr_connection(config_data: Dict) -> Dict:
    """Validate the OCR settings in ``config_data``.

    Despite the name, no OCR service is contacted: ``tesseract`` is always
    reported as available, and other services only have their API key
    checked for presence and a minimum length of 10 characters.

    Args:
        config_data: Configuration dict. ``ocrService`` defaults to
            ``'tesseract'`` and ``ocrApiKey`` to ``''``. For
            ``google-vision`` an empty key falls back to the
            ``GOOGLE_CLOUD_API_KEY`` and then ``GOOGLE_API_KEY``
            environment variables.

    Returns:
        ``{'success': True, 'message': ...}`` when the settings look valid,
        otherwise ``{'success': False, 'error': ...}``. Unexpected
        exceptions are reported as a failure payload rather than raised.
    """
    try:
        service = config_data.get('ocrService', 'tesseract')
        api_key = config_data.get('ocrApiKey', '')

        if service == 'tesseract':
            return {'success': True, 'message': 'Tesseract is available locally'}

        if service == 'google-vision':
            if not api_key:
                # Prefer the dedicated Cloud key, then the shared Google key.
                api_key = (
                    os.getenv('GOOGLE_CLOUD_API_KEY', '')
                    or os.getenv('GOOGLE_API_KEY', '')
                )

            if api_key and len(api_key) >= 10:
                return {
                    'success': True,
                    'message': 'Google Vision API configuration is valid',
                }
            return {
                'success': False,
                'error': 'Google Vision API key is required',
            }

        # Any other service is treated as a cloud OCR provider.
        if not api_key:
            return {
                'success': False,
                'error': 'API key is required for cloud OCR services',
            }

        if len(api_key) < 10:
            return {
                'success': False,
                'error': 'API key appears to be invalid',
            }

        return {
            'success': True,
            'message': f'{service} OCR connection test successful',
        }

    except Exception as e:
        return {
            'success': False,
            'error': f'OCR test failed: {str(e)}',
        }


# The "test_" prefix would make pytest collect this function as a test when it
# is imported into a test module; opt out explicitly.
test_ocr_connection.__test__ = False

0 commit comments

Comments
 (0)