1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+ # Software Name: floss-toolbox
4+ # SPDX-FileCopyrightText: Copyright (c) Orange SA
5+ # SPDX-License-Identifier: Apache-2.0
6+ #
7+ # This software is distributed under the Apache 2.0 license,
8+ # the text of which is available at https://opensource.org/license/apache-2-0
9+ # or see the "LICENSE.txt" file for more details.
10+ #
11+ # Authors: See CONTRIBUTORS.txt
12+ # Software description: A toolbox of scripts to help work of forges admins and open source referents
13+
14+ """
15+ Script to scan all public repositories of a given GitHub organization
16+ for the presence of dependencies defined in a side file.
17+
18+ - The list of dependencies must be provided as a text file, given
19+ as a command-line argument, with one dependency name per line.
20+ - The script looks for common dependency files (package.json, requirements.txt, etc.)
21+ anywhere in the file tree of each repository. It does not check the dependency versions.
22+ - Requires a GitHub API token set in the environment variable GITHUB_TOKEN.
23+
24+ WARNING: This is just a non-invasive and side tool to check repositories online.
25+ You should of course prefer tools integrated into your repositories to look for vulnerabilities and keep
26+ dependencies up to date, for example:
27+ - CodeQL: https://codeql.github.com/
28+ - Syft: https://github.com/anchore/syft
29+ - Grype: https://github.com/anchore/grype
30+ - Dependabot: https://docs.github.com/en/code-security/getting-started/dependabot-quickstart-guide
31+ - Renovate: https://github.com/renovatebot/renovate
32+
33+ The tool only checks the latest state of each repository's default branch. It does not look inside the Git history.
34+
35+ WARNING: You may face API rate limits
36+
37+ How to set the GITHUB_TOKEN environment variable:
38+ On Linux/macOS:
39+ export GITHUB_TOKEN=your_personal_access_token
40+ On Windows (Command Prompt):
41+ set GITHUB_TOKEN=your_personal_access_token
42+ On Windows (PowerShell):
43+ $env:GITHUB_TOKEN="your_personal_access_token"
44+
45+ GitHub token permissions required:
46+ - For public repositories: The "public_repo" scope is sufficient.
47+ - For best results, generate a token with "public_repo" and "read:org" scopes.
48+
49+ Usage:
50+ python3.8 scan_github_organization_projects_for_dependencies.py <path/to/file_of_deps.txt>
51+
52+ Arguments:
53+ <path/to/file_of_deps.txt> Path to the file containing dependency names (one per line), nothing else
54+
55+ Exit codes:
56+ 0 Success
57+ 1 GITHUB_TOKEN is not set
58+ 2 Dependency file argument is missing, or the file does not exist, is unreadable or empty
59+ """
60+
61+ import base64
62+ import os
63+ import requests
64+ import sys
65+ import time
66+
67+ # ====== CONFIGURATION ======
68+
# The GitHub organization name.
# It is used as the organization / repository owner in every GitHub API request made by this script.
GITHUB_ORGANIZATION = "Orange-OpenSource"
71+
# Names of the files (and their locks) listing dependencies to look for.
# A file of a repository is scanned when its base name is one of these entries.
# WARNING: List is not curated, files are supposed to be accessible, and versions are not managed.
# NOTE: Comment out the lines you do not want to process.
# NOTE: Keep the trailing comma on every line: two adjacent string literals without a comma
#       are silently concatenated by Python into one (wrong) file name.
DEPENDENCY_FILES = [
    # JavaScript, Node.js (NPM, Yarn and PNPM)
    "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    # Rust
    "Cargo.toml", "Cargo.lock",
    # Go
    "go.mod", "go.sum",
    # Java, Kotlin (Maven, Gradle)
    "pom.xml", "build.gradle", "build.gradle.kts", "gradle.lockfile",
    "settings.gradle.kts",
    # Swift (Swift Package Manager, Cocoapods, Carthage)
    "Package.swift", "Podfile", "Cartfile",
    "Package.resolved", "Podfile.lock", "Cartfile.resolved",
    # Python
    "requirements.txt", "pyproject.toml", "Pipfile.lock", "conda-lock.yml", "poetry.lock",
    # Ruby
    "Gemfile", "Gemfile.lock",
    # Flutter / Dart
    "pubspec.yaml", "pubspec.lock",
]
95+
# GitHub Personal Access Token used to request the GitHub API.
# Read from the GITHUB_TOKEN environment variable; None when the variable is not set.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Headers containing the token for the GitHub API requests.
HEADERS = {
    # The value must be exactly "token <value>": no padding around or after the token.
    # An empty string is used when no token is available.
    "Authorization": f"token {GITHUB_TOKEN}" if GITHUB_TOKEN else "",
    "Accept": "application/vnd.github.v3+json",
}
104+
105+ # ======== SERVICE ========
106+
# Process exit codes of the script (see "Exit codes" in the module docstring).
# Success
EXIT_OK = 0
# The GITHUB_TOKEN environment variable is not set
EXIT_TOKEN_ISSUE = 1
# The dependency file argument is missing, or the file does not exist, is unreadable or empty
EXIT_DEPENDENCIES_ISSUE = 2
110+
def load_dependencies(file_path):
    """
    Reads a list of dependencies from a text file.

    Blank lines are skipped, surrounding whitespace is removed and duplicated
    names are kept only once (first occurrence wins, order is preserved).

    Args:
        file_path (str): Path to the text file containing dependency names (one per line).

    Returns:
        list: A list of non-empty, stripped, unique dependency names.

    Raises:
        SystemExit: If the file does not exist, is unreadable or empty (EXIT_DEPENDENCIES_ISSUE).
    """
    if not os.path.isfile(file_path):
        print(f"❌ Error: The file '{file_path}' does not exist.")
        sys.exit(EXIT_DEPENDENCIES_ISSUE)
    try:
        # "utf-8-sig" reads plain UTF-8 as well, and drops the byte order mark some
        # editors (notably on Windows) write; str.strip() would leave that mark glued
        # to the first dependency name, which would then never match anything.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            stripped_lines = (line.strip() for line in f)
            # dict.fromkeys() removes duplicates while keeping the file order
            deps = list(dict.fromkeys(line for line in stripped_lines if line))
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding failures mean "unreadable"; anything else is a real bug
        print(f"❌ Error reading file '{file_path}': {e}")
        sys.exit(EXIT_DEPENDENCIES_ISSUE)
    if not deps:
        print(f"❌ Error: The file '{file_path}' is empty or contains no valid dependencies.")
        sys.exit(EXIT_DEPENDENCIES_ISSUE)
    return deps
137+
def get_repos(org, headers, timeout=30):
    """
    Retrieves all public repositories for the specified organization via the GitHub API.

    Results are fetched page by page. On any failure (network error or non-200
    response) a warning is printed and the repositories collected so far are returned.

    Args:
        org (str): The GitHub organization name.
        headers (dict): HTTP headers including authorization.
        timeout (float): Maximum number of seconds to wait for each HTTP response.

    Returns:
        list: A list of repository metadata dictionaries.
    """
    per_page = 100
    url = f"https://api.github.com/orgs/{org}/repos"
    repos = []
    page = 1
    while True:
        # "type=public" keeps the result aligned with the documented scope of the script,
        # whatever the permissions of the token are.
        params = {"type": "public", "per_page": per_page, "page": page}
        try:
            res = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"⚠️ Warning: Repos request failed ({e}).")
            break
        if res.status_code != 200:
            print(f"⚠️ Warning: Got status code {res.status_code} for repos request ({res.text}).")
            break
        data = res.json()
        if not data:
            break
        repos += data
        if len(data) < per_page:
            # A page that is not full is the last one: spare one request (API rate limits)
            break
        page += 1
    return repos
163+
def get_default_branch(owner, repo, headers, timeout=30):
    """
    Returns the default branch name for the repo.

    Args:
        owner (str): Repository owner (organization or user).
        repo (str): Repository name.
        headers (dict): HTTP headers including authorization.
        timeout (float): Maximum number of seconds to wait for the HTTP response.

    Returns:
        str: Default branch name, or "main" if it cannot be retrieved
        (network error, non-200 response, or missing/empty field).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"⚠️ Warning: Default branch request failed ({e}).")
        return "main"
    if res.status_code != 200:
        print(f"⚠️ Warning: Got status code {res.status_code} for default branch request ({res.text}).")
        return "main"
    data = res.json()
    # "or" also covers a present-but-null field, which dict.get() with a default would return as None
    return data.get("default_branch") or "main"
183+
def get_repo_tree(owner, repo, branch, headers, timeout=30):
    """
    Retrieves the full (recursive) file tree of a repo at the given branch.

    Args:
        owner (str): Repository owner (organization or user).
        repo (str): Repository name.
        branch (str): Branch name (usually the default branch).
        headers (dict): HTTP headers including authorization.
        timeout (float): Maximum number of seconds to wait for the HTTP response.

    Returns:
        list: A list of file metadata dictionaries (with 'path', 'type', etc.),
        or an empty list if the tree cannot be retrieved.
    """
    from urllib.parse import quote

    # Branch names may contain characters that are special in a URL ("#", "?", "%", ...);
    # quote() escapes them and keeps "/" as is.
    url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{quote(branch)}"
    try:
        res = requests.get(url, headers=headers, params={"recursive": "1"}, timeout=timeout)
    except requests.RequestException as e:
        print(f"⚠️ Warning: Repo tree request failed ({e}).")
        return []
    if res.status_code != 200:
        print(f"⚠️ Warning: Got status code {res.status_code} for repo tree request.")
        return []
    data = res.json()
    if data.get("truncated"):
        # GitHub flags trees too large to be returned in full: some files will not be scanned
        print(f"⚠️ Warning: The file tree of '{owner}/{repo}' is truncated, some files will be missed.")
    return data.get("tree", [])
204+
def get_file_content_by_path(owner, repo, file_path, headers, timeout=30):
    """
    Fetches and decodes the content of a file at any path in a GitHub repository using the API.

    The content is first requested as JSON (base64 payload). When GitHub answers
    with encoding "none" (no inline content, which is what it reports for large files
    such as big lock files), the file is requested again with the raw media type.

    Args:
        owner (str): GitHub organization or user name.
        repo (str): Repository name.
        file_path (str): Path to the file in the repository.
        headers (dict): HTTP headers including authorization.
        timeout (float): Maximum number of seconds to wait for each HTTP response.

    Returns:
        str or None: The decoded file content as a string, or None if not found or unreadable.
    """
    from urllib.parse import quote

    # File paths may contain characters that are special in a URL ("#", "?", "%", ...);
    # quote() escapes them and keeps the "/" separators as is.
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(file_path)}"
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            print(f"⚠️ Warning: Got status code {res.status_code} for file content request ({res.text}).")
            return None
        content = res.json()
        if not isinstance(content, dict):
            # A list is returned when the path is a directory
            return None
        if "content" in content and content.get("encoding") == "base64":
            return base64.b64decode(content["content"]).decode("utf-8", errors="ignore")
        if content.get("encoding") == "none":
            # No inline payload: ask for the raw bytes instead of silently skipping the file
            raw_headers = {**headers, "Accept": "application/vnd.github.raw+json"}
            raw = requests.get(url, headers=raw_headers, timeout=timeout)
            if raw.status_code == 200:
                return raw.content.decode("utf-8", errors="ignore")
            print(f"⚠️ Warning: Got status code {raw.status_code} for raw file content request ({raw.text}).")
    except requests.RequestException as e:
        print(f"⚠️ Warning: File content request failed for '{file_path}' ({e}).")
    return None
227+
def find_dependencies_in_text(text, deps):
    """
    Returns a list of dependencies found in the given text.

    The check is a plain, case-sensitive substring search: a dependency is
    reported as soon as its name appears anywhere in the text.

    Args:
        text (str or None): The file content to scan.
        deps (list): List of dependency names to search for.

    Returns:
        list: Dependencies from 'deps' found in 'text', in the order of 'deps'.
        Empty when 'text' is None or empty.
    """
    if not text:
        # Nothing to scan (e.g. the content could not be fetched)
        return []
    return [dep for dep in deps if dep in text]
244+
def _scan_repository(repo_name, branch, dependencies):
    """
    Looks for the given dependencies in every dependency file of one repository.

    Args:
        repo_name (str): Repository name (inside GITHUB_ORGANIZATION).
        branch (str): Branch whose file tree is scanned.
        dependencies (list): Dependency names to look for.

    Returns:
        dict: Maps the path of each matching file to the list of dependencies found in it.
    """
    found_deps = {}
    tree = get_repo_tree(GITHUB_ORGANIZATION, repo_name, branch, HEADERS)
    for entry in tree:
        # Only regular files ("blob" entries) can be dependency files
        if entry["type"] != "blob":
            continue
        path = entry["path"]
        # The match is done on the base name, so files are found at any depth of the tree
        if path.split("/")[-1] not in DEPENDENCY_FILES:
            continue
        content = get_file_content_by_path(GITHUB_ORGANIZATION, repo_name, path, HEADERS)
        if not content:
            continue
        deps_in_file = find_dependencies_in_text(content, dependencies)
        if deps_in_file:
            found_deps[path] = deps_in_file
    return found_deps


def main():
    """
    Main function to orchestrate the scanning process.

    - Checks environment and command-line arguments.
    - Loads dependencies to look for.
    - Retrieves and scans all repositories for dependencies, wherever they are in the tree.
    - Prints a progress message for the first repository and then every 10 repositories.
    - For each repository where a dependency is found, prints the repository and detected dependencies.
    - Displays the elapsed time and the number of projects with found dependencies.

    Raises:
        SystemExit: If GITHUB_TOKEN is not set (EXIT_TOKEN_ISSUE), or if the dependency
            file argument is missing or the file is unusable (EXIT_DEPENDENCIES_ISSUE).
    """
    if not GITHUB_TOKEN:
        print("❌ Error: Please set the GITHUB_TOKEN environment variable with a valid GitHub personal access token.")
        sys.exit(EXIT_TOKEN_ISSUE)

    if len(sys.argv) < 2:
        print("Usage: python3.8 scan_github_organization_projects_for_dependencies.py path/to/deps.txt")
        sys.exit(EXIT_DEPENDENCIES_ISSUE)

    # A monotonic clock is not affected by system clock adjustments during a long scan
    start_time = time.monotonic()

    dependencies_file = sys.argv[1]
    loaded_dependencies = load_dependencies(dependencies_file)
    print(f"✅ Loaded '{len(loaded_dependencies)}' dependencies to check.")
    repos = get_repos(GITHUB_ORGANIZATION, HEADERS)
    print(f"✅ Found '{len(repos)}' repositories in '{GITHUB_ORGANIZATION}'.")

    repos_with_dependencies = 0

    for idx, repo in enumerate(repos, 1):
        if idx % 10 == 0 or idx == 1:
            print(f"🔎 Scanning repository {idx} / {len(repos)}...")
        repo_name = repo["name"]
        repo_url = repo["html_url"]
        # The repository listing already carries the default branch: only ask the API
        # again when it is missing, which spares one request per repository (rate limits).
        branch = repo.get("default_branch") or get_default_branch(GITHUB_ORGANIZATION, repo_name, HEADERS)
        found_deps = _scan_repository(repo_name, branch, loaded_dependencies)
        if not found_deps:
            continue
        repos_with_dependencies += 1
        print(f"\n🎯 Repository: {repo_name}\n   URL: {repo_url}")
        print("➡️ Dependencies found in these files:")
        for path, deps in found_deps.items():
            print(f"   {path}:")
            for dep in deps:
                print(f"      - {dep}")

    elapsed_time = time.monotonic() - start_time
    print("\n====== Scan summary ======")
    print(f"📝 Repositories with at least one found dependency: {repos_with_dependencies}")
    print(f"⌛ Elapsed time: {elapsed_time:.2f} seconds")
304+
305+ # ========= MAIN =========
306+
# Run the scan only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()