Skip to content

Commit a6b96e0

Browse files
committed
optimize the query
1 parent a23f1b8 commit a6b96e0

File tree

1 file changed

+68
-26
lines changed

1 file changed

+68
-26
lines changed

backend/app/services/gexf_node_service.py

Lines changed: 68 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -61,52 +61,94 @@ def generate_gexf_nodes_for_topics(self, topics):
6161
topics_lower = [t.lower() for t in topics]
6262
placeholders = ",".join(["?"] * len(topics_lower))
6363

64-
# Debug: Check what columns actually exist in the repos table
65-
schema_query = "DESCRIBE repos"
66-
# try:
67-
# schema_result = self.con.execute(schema_query).fetchall()
68-
# # print("Repos table schema:")
69-
# # for col in schema_result:
70-
# # print(f" {col}")
71-
# except Exception as e:
72-
# print(f"Could not get schema: {e}")
64+
# Use consistent memory and thread settings with TopicService
65+
available_memory = psutil.virtual_memory().available
66+
memory_limit = min(available_memory * 0.3, 0.5 * 1024 * 1024 * 1024) # Use 30% of available memory, max 0.5GB
67+
self.con.execute(f"SET memory_limit TO '{int(memory_limit)}B'")
68+
69+
# Set conservative thread count
70+
cpu_count = psutil.cpu_count(logical=False) or 1
71+
thread_count = max(1, min(cpu_count, 2)) # Use at most 2 threads
72+
self.con.execute(f"SET threads TO {thread_count}")
73+
74+
# Note: Cannot create indexes on read-only database
75+
# The query will rely on DuckDB's built-in query optimization
7376

77+
# Optimized query with better structure and materialized CTEs
7478
query = f"""
75-
WITH matching_repos AS (
79+
WITH matching_repos AS MATERIALIZED (
7680
SELECT DISTINCT r.nameWithOwner
7781
FROM repos r
78-
JOIN repo_topics t ON r.nameWithOwner = t.repo
82+
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
7983
WHERE LOWER(t.topic) IN ({placeholders})
8084
),
81-
repo_topics_agg AS (
85+
repo_topics_agg AS MATERIALIZED (
8286
SELECT
8387
r.nameWithOwner,
8488
GROUP_CONCAT(t.topic, '|') AS topics
8589
FROM repos r
86-
JOIN repo_topics t ON r.nameWithOwner = t.repo
87-
JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
90+
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
91+
INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
8892
GROUP BY r.nameWithOwner
8993
)
9094
SELECT
9195
r.nameWithOwner,
92-
r.stars,
93-
r.forks,
94-
r.watchers,
95-
r.isArchived,
96-
r.languageCount,
97-
r.pullRequests,
98-
r.issues,
99-
r.primaryLanguage,
96+
COALESCE(r.stars, 0) as stars,
97+
COALESCE(r.forks, 0) as forks,
98+
COALESCE(r.watchers, 0) as watchers,
99+
COALESCE(r.isArchived, false) as isArchived,
100+
COALESCE(r.languageCount, 0) as languageCount,
101+
COALESCE(r.pullRequests, 0) as pullRequests,
102+
COALESCE(r.issues, 0) as issues,
103+
COALESCE(r.primaryLanguage, '') as primaryLanguage,
100104
r.createdAt,
101-
r.license,
105+
COALESCE(r.license, '') as license,
102106
rt.topics,
103107
r.bigquery_contributors,
104108
r.bigquery_stargazers
105109
FROM repos r
106-
JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
107-
JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner;
110+
INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
111+
INNER JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
112+
ORDER BY r.stars DESC, r.forks DESC
113+
LIMIT 10000; -- Limit results to prevent memory issues
108114
"""
109-
result = self.con.execute(query, topics_lower).fetchall()
115+
116+
# Execute query with optimized settings
117+
try:
118+
result = self.con.execute(query, topics_lower).fetchall()
119+
except Exception as e:
120+
print(f"Query failed, falling back to simpler version: {e}")
121+
# Fallback to simpler query if the optimized one fails
122+
fallback_query = f"""
123+
SELECT DISTINCT
124+
r.nameWithOwner,
125+
COALESCE(r.stars, 0) as stars,
126+
COALESCE(r.forks, 0) as forks,
127+
COALESCE(r.watchers, 0) as watchers,
128+
COALESCE(r.isArchived, false) as isArchived,
129+
COALESCE(r.languageCount, 0) as languageCount,
130+
COALESCE(r.pullRequests, 0) as pullRequests,
131+
COALESCE(r.issues, 0) as issues,
132+
COALESCE(r.primaryLanguage, '') as primaryLanguage,
133+
r.createdAt,
134+
COALESCE(r.license, '') as license,
135+
GROUP_CONCAT(t.topic, '|') as topics,
136+
r.bigquery_contributors,
137+
r.bigquery_stargazers
138+
FROM repos r
139+
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
140+
WHERE LOWER(t.topic) IN ({placeholders})
141+
GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived,
142+
r.languageCount, r.pullRequests, r.issues, r.primaryLanguage,
143+
r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
144+
ORDER BY r.stars DESC, r.forks DESC
145+
LIMIT 10000;
146+
"""
147+
result = self.con.execute(fallback_query, topics_lower).fetchall()
148+
149+
# Reset memory settings
150+
self.con.execute("SET memory_limit TO DEFAULT")
151+
self.con.execute("SET threads TO DEFAULT")
110152

111153
# Debug: Print the first few rows to see what we're getting
112154
# if result:

0 commit comments

Comments
 (0)