@@ -61,52 +61,94 @@ def generate_gexf_nodes_for_topics(self, topics):
6161 topics_lower = [t .lower () for t in topics ]
6262 placeholders = "," .join (["?" ] * len (topics_lower ))
6363
64- # Debug: Check what columns actually exist in the repos table
65- schema_query = "DESCRIBE repos"
66- # try:
67- # schema_result = self.con.execute(schema_query).fetchall()
68- # # print("Repos table schema:")
69- # # for col in schema_result:
70- # # print(f" {col}")
71- # except Exception as e:
72- # print(f"Could not get schema: {e}")
64+ # Use memory and thread settings consistent with TopicService
65+ available_memory = psutil .virtual_memory ().available
66+ memory_limit = min (available_memory * 0.3 , 0.5 * 1024 * 1024 * 1024 ) # Use 30% of available memory, max 0.5GB
67+ self .con .execute (f"SET memory_limit TO '{ int (memory_limit )} B'" )
68+
69+ # Set conservative thread count
70+ cpu_count = psutil .cpu_count (logical = False ) or 1
71+ thread_count = max (1 , min (cpu_count , 2 )) # Use at most 2 threads
72+ self .con .execute (f"SET threads TO { thread_count } " )
73+
74+ # Note: Cannot create indexes on read-only database
75+ # The query will rely on DuckDB's built-in query optimization
7376
77+ # Optimized query with better structure and materialized CTEs
7478 query = f"""
75- WITH matching_repos AS (
79+ WITH matching_repos AS MATERIALIZED (
7680 SELECT DISTINCT r.nameWithOwner
7781 FROM repos r
78- JOIN repo_topics t ON r.nameWithOwner = t.repo
82+ INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
7983 WHERE LOWER(t.topic) IN ({ placeholders } )
8084 ),
81- repo_topics_agg AS (
85+ repo_topics_agg AS MATERIALIZED (
8286 SELECT
8387 r.nameWithOwner,
8488 GROUP_CONCAT(t.topic, '|') AS topics
8589 FROM repos r
86- JOIN repo_topics t ON r.nameWithOwner = t.repo
87- JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
90+ INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
91+ INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
8892 GROUP BY r.nameWithOwner
8993 )
9094 SELECT
9195 r.nameWithOwner,
92- r. stars,
93- r. forks,
94- r. watchers,
95- r. isArchived,
96- r. languageCount,
97- r. pullRequests,
98- r. issues,
99- r. primaryLanguage,
96+ COALESCE(r.stars, 0) as stars,
97+ COALESCE(r.forks, 0) as forks,
98+ COALESCE(r.watchers, 0) as watchers,
99+ COALESCE(r.isArchived, false) as isArchived,
100+ COALESCE(r.languageCount, 0) as languageCount,
101+ COALESCE(r.pullRequests, 0) as pullRequests,
102+ COALESCE(r.issues, 0) as issues,
103+ COALESCE(r.primaryLanguage, '') as primaryLanguage,
100104 r.createdAt,
101- r. license,
105+ COALESCE(r.license, '') as license,
102106 rt.topics,
103107 r.bigquery_contributors,
104108 r.bigquery_stargazers
105109 FROM repos r
106- JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
107- JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner;
110+ INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
111+ INNER JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
112+ ORDER BY r.stars DESC, r.forks DESC
113+ LIMIT 10000; -- Limit results to prevent memory issues
108114 """
109- result = self .con .execute (query , topics_lower ).fetchall ()
115+
116+ # Execute query with optimized settings
117+ try :
118+ result = self .con .execute (query , topics_lower ).fetchall ()
119+ except Exception as e :
120+ print (f"Query failed, falling back to simpler version: { e } " )
121+ # Fallback to simpler query if the optimized one fails
122+ fallback_query = f"""
123+ SELECT DISTINCT
124+ r.nameWithOwner,
125+ COALESCE(r.stars, 0) as stars,
126+ COALESCE(r.forks, 0) as forks,
127+ COALESCE(r.watchers, 0) as watchers,
128+ COALESCE(r.isArchived, false) as isArchived,
129+ COALESCE(r.languageCount, 0) as languageCount,
130+ COALESCE(r.pullRequests, 0) as pullRequests,
131+ COALESCE(r.issues, 0) as issues,
132+ COALESCE(r.primaryLanguage, '') as primaryLanguage,
133+ r.createdAt,
134+ COALESCE(r.license, '') as license,
135+ GROUP_CONCAT(t.topic, '|') as topics,
136+ r.bigquery_contributors,
137+ r.bigquery_stargazers
138+ FROM repos r
139+ INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
140+ WHERE LOWER(t.topic) IN ({ placeholders } )
141+ GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived,
142+ r.languageCount, r.pullRequests, r.issues, r.primaryLanguage,
143+ r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
144+ ORDER BY r.stars DESC, r.forks DESC
145+ LIMIT 10000;
146+ """
147+ result = self .con .execute (fallback_query , topics_lower ).fetchall ()
148+
149+ # Reset memory and thread settings
150+ self .con .execute ("SET memory_limit TO DEFAULT" )
151+ self .con .execute ("SET threads TO DEFAULT" )
110152
111153 # Debug: Print the first few rows to see what we're getting
112154 # if result:
0 commit comments