Skip to content

Commit b558ed2

Browse files
committed
feat: apply same filters from go minr
feat: create standalone presenters that implement AbstractPresenter; feat: add progress tracking while scanning, clean folder-scan command arguments; feat: use progress instead of rich; feat: add folder hash sub-command, fix hfh spinner; feat: update docs
1 parent e2a64c9 commit b558ed2

File tree

9 files changed

+611
-247
lines changed

9 files changed

+611
-247
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1212
## [1.21.0] - 2025-02-10
1313
### Added
1414
- Add folder-scan subcommand
15+
- Add folder-hash subcommand
1516
- Add AbstractPresenter class for presenting output in a given format
1617
- Add several reusable helper functions for constructing config objects from CLI args
1718

Diff for: docs/source/index.rst

+73
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,79 @@ Convert file format to plain, SPDX-Lite, CycloneDX or csv.
228228
* - --format <format>, -f <format>
229229
- Indicates the result output format: {plain, cyclonedx, spdxlite, csv}. (optional - default plain)
230230

231+
--------------------------------
232+
Folder Scanning: folder-scan, fs
233+
--------------------------------
234+
235+
Performs a comprehensive scan of a directory using folder hashing to identify components and their matches.
236+
237+
.. code-block:: bash
238+
239+
scanoss-py folder-scan <directory>
240+
241+
.. list-table::
242+
:widths: 20 30
243+
:header-rows: 1
244+
245+
* - Argument
246+
- Description
247+
* - --output <file name>, -o <file name>
248+
- Output result file name (optional - default STDOUT)
249+
* - --format <format>, -f <format>
250+
- Output format: {json} (optional - default json)
251+
* - --timeout <seconds>, -M <seconds>
252+
- Timeout in seconds for API communication (optional - default 600)
253+
* - --best-match, -bm
254+
- Enable best match mode (optional - default: False)
255+
* - --threshold <1-100>
256+
- Threshold for result matching (optional - default: 100)
257+
* - --settings <file>, -st <file>
258+
- Settings file to use for scanning (optional - default scanoss.json)
259+
* - --skip-settings-file, -stf
260+
- Skip default settings file (scanoss.json) if it exists
261+
* - --key <token>, -k <token>
262+
- SCANOSS API Key token (optional - not required for default OSSKB URL)
263+
* - --proxy <url>
264+
- Proxy URL to use for connections
265+
* - --pac <file/url>
266+
- Proxy auto configuration. Specify a file, http url or "auto"
267+
* - --ca-cert <file>
268+
- Alternative certificate PEM file
269+
* - --api2url <url>
270+
- SCANOSS gRPC API 2.0 URL (optional - default: https://api.osskb.org)
271+
* - --grpc-proxy <url>
272+
- GRPC Proxy URL to use for connections
273+
274+
--------------------------------
275+
Folder Hashing: folder-hash, fh
276+
--------------------------------
277+
278+
Generates cryptographic hashes for files in a given directory and its subdirectories.
279+
280+
.. code-block:: bash
281+
282+
scanoss-py folder-hash <directory>
283+
284+
.. list-table::
285+
:widths: 20 30
286+
:header-rows: 1
287+
288+
* - Argument
289+
- Description
290+
* - --output <file name>, -o <file name>
291+
- Output result file name (optional - default STDOUT)
292+
* - --format <format>, -f <format>
293+
- Output format: {json} (optional - default json)
294+
* - --settings <file>, -st <file>
295+
- Settings file to use for scanning (optional - default scanoss.json)
296+
* - --skip-settings-file, -stf
297+
- Skip default settings file (scanoss.json) if it exists
298+
299+
Both commands also support these general options:
300+
* --debug, -d: Enable debug messages
301+
* --trace, -t: Enable trace messages
302+
* --quiet, -q: Enable quiet mode
303+
231304
-----------------
232305
Component:
233306
-----------------

Diff for: setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ install_requires =
3838
packageurl-python
3939
pathspec
4040
jsonschema
41+
crc
4142

4243

4344
[options.extras_require]

Diff for: src/scanoss/cli.py

+76-4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030

3131
import pypac
3232

33+
from scanoss.scanners.folder_hasher import (
34+
FolderHasher,
35+
create_folder_hasher_config_from_args,
36+
)
3337
from scanoss.scanossgrpc import (
3438
ScanossGrpc,
3539
ScanossGrpcError,
@@ -316,6 +320,7 @@ def setup_args() -> None: # noqa: PLR0915
316320
for p in [c_crypto, c_vulns, c_semgrep]:
317321
p.add_argument('--purl', '-p', type=str, nargs='*', help='Package URL - PURL to process.')
318322
p.add_argument('--input', '-i', type=str, help='Input file name')
323+
319324
# Common Component sub-command options
320325
for p in [c_crypto, c_vulns, c_search, c_versions, c_semgrep, c_provenance]:
321326
p.add_argument('--output', '-o', type=str, help='Output result file name (optional - default stdout).')
@@ -474,35 +479,68 @@ def setup_args() -> None: # noqa: PLR0915
474479
)
475480
p_undeclared.set_defaults(func=inspect_undeclared)
476481

482+
# Sub-command: folder-scan
477483
p_folder_scan = subparsers.add_parser(
478484
'folder-scan',
485+
aliases=['fs'],
479486
description=f'Scan the given directory using folder hashing: {__version__}',
480487
help='Scan the given directory using folder hashing',
481488
)
482489
p_folder_scan.add_argument('scan_dir', metavar='FILE/DIR', type=str, nargs='?', help='The root directory to scan')
483490
p_folder_scan.add_argument('--output', '-o', type=str, help='Output result file name (optional - default stdout).')
491+
p_folder_scan.add_argument(
492+
'--timeout',
493+
'-M',
494+
type=int,
495+
default=600,
496+
help='Timeout (in seconds) for API communication (optional - default 600)',
497+
)
484498
p_folder_scan.add_argument(
485499
'--format',
486500
'-f',
487501
type=str,
488-
choices=['plain', 'json'],
489-
help='Result output format (optional - default: plain)',
502+
choices=['json'],
503+
default='json',
504+
help='Result output format (optional - default: json)',
490505
)
491506
p_folder_scan.add_argument(
492507
'--best-match',
493508
'-bm',
494509
action='store_true',
510+
default=False,
495511
help='Enable best match mode (optional - default: False)',
496512
)
497513
p_folder_scan.add_argument(
498514
'--threshold',
499515
type=int,
516+
choices=range(1, 101),
517+
metavar='1-100',
518+
default=100,
500519
help='Threshold for result matching (optional - default: 100)',
501520
)
502521
p_folder_scan.set_defaults(func=folder_hashing_scan)
503522

523+
# Sub-command: folder-hash
524+
p_folder_hash = subparsers.add_parser(
525+
'folder-hash',
526+
aliases=['fh'],
527+
description=f'Produce a folder hash for the given directory: {__version__}',
528+
help='Produce a folder hash for the given directory',
529+
)
530+
p_folder_hash.add_argument('scan_dir', metavar='FILE/DIR', type=str, nargs='?', help='A file or folder to scan')
531+
p_folder_hash.add_argument('--output', '-o', type=str, help='Output result file name (optional - default stdout).')
532+
p_folder_hash.add_argument(
533+
'--format',
534+
'-f',
535+
type=str,
536+
choices=['json'],
537+
default='json',
538+
help='Result output format (optional - default: json)',
539+
)
540+
p_folder_hash.set_defaults(func=folder_hash)
541+
504542
# Scanoss settings options
505-
for p in [p_folder_scan, p_scan, p_wfp]:
543+
for p in [p_folder_scan, p_scan, p_wfp, p_folder_hash]:
506544
p.add_argument(
507545
'--settings',
508546
'-st',
@@ -530,7 +568,7 @@ def setup_args() -> None: # noqa: PLR0915
530568
p.add_argument('-s', '--status', type=str, help='Save summary data into Markdown file')
531569

532570
# Global Scan command options
533-
for p in [p_scan, p_folder_scan]:
571+
for p in [p_scan]:
534572
p.add_argument(
535573
'--apiurl', type=str, help='SCANOSS API URL (optional - default: https://api.osskb.org/scan/direct)'
536574
)
@@ -614,6 +652,7 @@ def setup_args() -> None: # noqa: PLR0915
614652
p_undeclared,
615653
p_copyleft,
616654
p_folder_scan,
655+
p_folder_hash,
617656
]:
618657
p.add_argument('--debug', '-d', action='store_true', help='Enable debug messages')
619658
p.add_argument('--trace', '-t', action='store_true', help='Enable trace messages, including API posts')
@@ -1499,6 +1538,39 @@ def folder_hashing_scan(parser, args):
14991538
sys.exit(1)
15001539

15011540

1541+
def folder_hash(parser, args):
    """Run the "folder-hash" sub-command

    Validates the requested directory, builds a FolderHasher from the parsed
    arguments, hashes the directory and presents the result. Any exception
    raised along the way is reported on STDERR and the process exits with
    status 1.

    Args:
        parser (ArgumentParser): command line parser object
        args (Namespace): Parsed arguments
    """
    try:
        if not args.scan_dir:
            print_stderr('ERROR: Please specify a directory to scan')
            # NOTE(review): argparse's -h action normally prints the help and exits by
            # itself, so the sys.exit(1) below may never be reached - confirm the
            # intended exit status.
            parser.parse_args([args.subparser, '-h'])
            sys.exit(1)

        # Report a missing path and a non-directory path separately so the message
        # matches what is actually wrong. sys.exit raises SystemExit, which is not
        # an Exception subclass and therefore bypasses the handler below.
        if not os.path.exists(args.scan_dir):
            print_stderr(f'ERROR: The specified directory {args.scan_dir} does not exist')
            sys.exit(1)
        if not os.path.isdir(args.scan_dir):
            print_stderr(f'ERROR: The specified path {args.scan_dir} is not a directory')
            sys.exit(1)

        folder_hasher_config = create_folder_hasher_config_from_args(args)
        scanoss_settings = get_scanoss_settings_from_args(args)

        folder_hasher = FolderHasher(
            scan_dir=args.scan_dir,
            config=folder_hasher_config,
            scanoss_settings=scanoss_settings,
        )

        folder_hasher.hash_directory(args.scan_dir)
        folder_hasher.present(output_file=args.output, output_format=args.format)
    except Exception as e:
        print_stderr(f'ERROR: {e}')
        sys.exit(1)
1573+
15021574
def get_scanoss_settings_from_args(args):
15031575
scanoss_settings = None
15041576
if not args.skip_settings_file:

Diff for: src/scanoss/file_filters.py

+36
Original file line numberDiff line numberDiff line change
@@ -514,3 +514,39 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
514514
self.print_debug(f'Skipping file: {file_rel_path} (matches custom pattern)')
515515
return True
516516
return False
517+
518+
def _should_skip_file_for_hfh(self, file_path: Path) -> bool:
    """
    Check if a file should be skipped during folder hashing scan.

    A file is skipped when any component of its path starts with a dot, when it
    is a symlink, when it is empty, or when it is a ``.txt`` file whose last
    byte is a null byte. Any error raised while inspecting the file also causes
    it to be skipped.

    Args:
        file_path (Path): The path to the file to check.

    Returns:
        bool: True if the file should be skipped, False otherwise.
    """
    try:
        # NOTE(review): every path component is tested, so a path containing '..'
        # or one located under a dot-directory is treated as hidden too - confirm
        # callers pass paths relative to the scan root.
        if (
            any(part.startswith('.') for part in file_path.parts)  # Hidden files/folders
            or file_path.is_symlink()  # Symlinks
            or file_path.stat().st_size == 0  # Empty files
        ):
            self.print_debug(f'Skipping file: {file_path} (hidden/symlink/empty)')
            return True

        # Text files ending with a null byte
        if file_path.suffix.lower() == '.txt':
            try:
                with open(file_path, 'rb') as f:
                    # Only the final byte matters: seek to it rather than loading
                    # the whole file into memory (whence=2 means "from end of file").
                    f.seek(-1, 2)
                    ends_with_null = f.read(1) == b'\x00'
            except OSError:
                self.print_debug(f'Skipping file: {file_path} (cannot read file content)')
                return True
            if ends_with_null:
                self.print_debug(f'Skipping file: {file_path} (text file ending with null)')
                return True

        return False

    except Exception as e:
        # Best effort: a file that cannot be inspected is skipped rather than failing the scan.
        self.print_debug(f'Error checking file {file_path}: {e}')
        return True

0 commit comments

Comments
 (0)