Skip to content

Commit 293aed4

Browse files
committed
feat: apply same filters from go minr
feat: create standalone presenters that implement abstractpresenter
1 parent e2a64c9 commit 293aed4

File tree

5 files changed

+146
-65
lines changed

5 files changed

+146
-65
lines changed

Diff for: src/scanoss/cli.py

+3
Original file line numberDiff line numberDiff line change
@@ -492,11 +492,14 @@ def setup_args() -> None: # noqa: PLR0915
492492
'--best-match',
493493
'-bm',
494494
action='store_true',
495+
default=False,
495496
help='Enable best match mode (optional - default: False)',
496497
)
497498
p_folder_scan.add_argument(
498499
'--threshold',
499500
type=int,
501+
choices=range(1, 101),
502+
default=100,
500503
help='Threshold for result matching (optional - default: 100)',
501504
)
502505
p_folder_scan.set_defaults(func=folder_hashing_scan)

Diff for: src/scanoss/file_filters.py

+36
Original file line numberDiff line numberDiff line change
@@ -514,3 +514,39 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
514514
self.print_debug(f'Skipping file: {file_rel_path} (matches custom pattern)')
515515
return True
516516
return False
517+
518+
def _should_skip_file_for_hfh(self, file_path: Path) -> bool:
    """
    Check if a file should be skipped during folder hashing scan.

    A file is skipped when any component of its path starts with a dot, when it is a
    symlink, when it is empty, or when it is a ``.txt`` file whose last byte is a null
    byte. Files that cannot be inspected or read are skipped as well.

    Args:
        file_path (Path): The path to the file to check.

    Returns:
        bool: True if the file should be skipped, False otherwise.
    """
    import os  # local import: only needed for the SEEK_END constant below

    try:
        # NOTE(review): every component of the given path is inspected, so an absolute path
        # that lives under a dot-prefixed ancestor directory is always skipped - confirm intended.
        if (
            any(part.startswith('.') for part in file_path.parts)  # Hidden files/folders
            or file_path.is_symlink()  # Symlinks
            or file_path.stat().st_size == 0  # Empty files
        ):
            self.print_debug(f'Skipping file: {file_path} (hidden/symlink/empty)')
            return True

        # Files ending with null
        if file_path.suffix.lower() == '.txt':
            try:
                with open(file_path, 'rb') as f:
                    # Only the final byte matters; seek to it instead of loading the whole file
                    f.seek(-1, os.SEEK_END)
                    ends_with_null = f.read(1) == b'\x00'
            except OSError:
                self.print_debug(f'Skipping file: {file_path} (cannot read file content)')
                return True
            if ends_with_null:
                self.print_debug(f'Skipping file: {file_path} (text file ending with null)')
                return True

        return False

    except Exception as e:
        # Deliberate best-effort: a file that cannot be checked is skipped rather than aborting the scan
        self.print_debug(f'Error checking file {file_path}: {str(e)}')
        return True

Diff for: src/scanoss/results.py

+79-54
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,70 @@
5050
}
5151

5252

53-
class Results(AbstractPresenter, ScanossBase):
53+
class ResultsPresenter(AbstractPresenter):
    """
    SCANOSS Results presenter class
    Handles the presentation of the scan results
    """

    def __init__(self, results_instance, **kwargs):
        """
        Create a presenter bound to a results object.

        Args:
            results_instance: Object whose ``data`` attribute holds the result items to present.
            **kwargs: Forwarded unchanged to ``AbstractPresenter.__init__``.
        """
        super().__init__(**kwargs)
        self.results = results_instance

    @staticmethod
    def _first_purl(item):
        """Return the first purl of an item, or 'N/A' when the item has none."""
        purls = item.get('purl')
        return purls[0] if purls else 'N/A'

    @staticmethod
    def _first_license_name(item):
        """Return the name of the first license of an item, or 'N/A' when unavailable."""
        licenses = item.get('licenses')
        return licenses[0].get('name', 'N/A') if licenses else 'N/A'

    def _format_json_output(self) -> str:
        """
        Format the output data into a JSON object

        Returns:
            str: JSON document with a 'results' list and a 'total' count, or an empty
            string if the data could not be serialised.
        """
        formatted_data = [
            {
                'file': item.get('filename'),
                'status': item.get('status', 'N/A'),
                # .get() keeps JSON output consistent with the plain output for items without an id
                'match_type': item.get('id'),
                'matched': item.get('matched', 'N/A'),
                'purl': self._first_purl(item),
                'license': self._first_license_name(item),
            }
            for item in self.results.data
        ]
        try:
            return json.dumps({'results': formatted_data, 'total': len(formatted_data)}, indent=2)
        except (TypeError, ValueError) as e:
            self.base.print_stderr(f'ERROR: Problem formatting JSON output: {e}')
            return ''

    def _format_plain_output(self) -> str:
        """Format the output data into a plain text string

        Returns:
            str: The formatted output data
        """
        if not self.results.data:
            return 'No results to present'

        # Each item block is followed by one extra newline, acting as a blank separator line
        return ''.join(f'{self._format_plain_output_item(item)}\n' for item in self.results.data)

    @staticmethod
    def _format_plain_output_item(item):
        """Render a single result item as a newline-terminated, multi-line text block."""
        return (
            f'File: {item.get("filename")}\n'
            f'Match type: {item.get("id")}\n'
            f'Status: {item.get("status", "N/A")}\n'
            f'Matched: {item.get("matched", "N/A")}\n'
            f'Purl: {ResultsPresenter._first_purl(item)}\n'
            f'License: {ResultsPresenter._first_license_name(item)}\n'
        )
114+
115+
116+
class Results:
54117
"""
55118
SCANOSS Results class \n
56119
Handles the parsing and filtering of the scan results
@@ -80,10 +143,17 @@ def __init__( # noqa: PLR0913
80143
output_format (str, optional): Output format. Defaults to None.
81144
"""
82145

83-
AbstractPresenter.__init__(self, output_file=output_file, output_format=output_format)
84-
ScanossBase.__init__(self, debug, trace, quiet)
146+
self.base = ScanossBase(debug, trace, quiet)
85147
self.data = self._load_and_transform(filepath)
86148
self.filters = self._load_filters(match_type=match_type, status=status)
149+
self.presenter = ResultsPresenter(
150+
self,
151+
debug=debug,
152+
trace=trace,
153+
quiet=quiet,
154+
output_file=output_file,
155+
output_format=output_format,
156+
)
87157

88158
def load_file(self, file: str) -> Dict[str, Any]:
89159
"""Load the JSON file
@@ -98,7 +168,7 @@ def load_file(self, file: str) -> Dict[str, Any]:
98168
try:
99169
return json.load(jsonfile)
100170
except Exception as e:
101-
self.print_stderr(f'ERROR: Problem parsing input JSON: {e}')
171+
self.base.print_stderr(f'ERROR: Problem parsing input JSON: {e}')
102172

103173
def _load_and_transform(self, file: str) -> List[Dict[str, Any]]:
104174
"""
@@ -173,8 +243,8 @@ def _item_matches_filters(self, item):
173243
def _validate_filter_values(filter_key: str, filter_value: List[str]):
    """
    Ensure every requested value is allowed for the given filter.

    Args:
        filter_key (str): Name of the filter, used to look up its allowed values.
        filter_value (List[str]): Values requested for that filter.

    Raises:
        ValueError: If at least one value is not listed for the filter in AVAILABLE_FILTER_VALUES.
    """
    allowed = AVAILABLE_FILTER_VALUES.get(filter_key, [])
    for candidate in filter_value:
        if candidate in allowed:
            continue
        valid_values = ', '.join(allowed)
        raise ValueError(
            f"ERROR: Invalid filter value '{filter_value}' for filter '{filter_key}'. "
            f'Valid values are: {valid_values}'
        )
180250

@@ -188,51 +258,6 @@ def get_pending_identifications(self):
188258
def has_results(self):
189259
return bool(self.data)
190260

191-
def _format_json_output(self) -> str:
192-
"""
193-
Format the output data into a JSON object
194-
"""
195-
196-
formatted_data = []
197-
for item in self.data:
198-
formatted_data.append(
199-
{
200-
'file': item.get('filename'),
201-
'status': item.get('status', 'N/A'),
202-
'match_type': item['id'],
203-
'matched': item.get('matched', 'N/A'),
204-
'purl': (item.get('purl')[0] if item.get('purl') else 'N/A'),
205-
'license': (item.get('licenses')[0].get('name', 'N/A') if item.get('licenses') else 'N/A'),
206-
}
207-
)
208-
return json.dumps({'results': formatted_data, 'total': len(formatted_data)}, indent=2)
209-
210-
def _format_plain_output(self) -> str:
211-
"""Format the output data into a plain text string
212-
213-
Returns:
214-
str: The formatted output data
215-
"""
216-
if not self.data:
217-
msg = 'No results to present'
218-
self.print_stderr(msg)
219-
return msg
220-
221-
formatted = ''
222-
for item in self.data:
223-
formatted += f'{self._format_plain_output_item(item)}\n'
224-
return formatted
225-
226-
@staticmethod
227-
def _format_plain_output_item(item):
228-
purls = item.get('purl', [])
229-
licenses = item.get('licenses', [])
230-
231-
return (
232-
f'File: {item.get("filename")}\n'
233-
f'Match type: {item.get("id")}\n'
234-
f'Status: {item.get("status", "N/A")}\n'
235-
f'Matched: {item.get("matched", "N/A")}\n'
236-
f'Purl: {purls[0] if purls else "N/A"}\n'
237-
f'License: {licenses[0].get("name", "N/A") if licenses else "N/A"}\n'
238-
)
261+
def present(self, output_format: str = None, output_file: str = None):
    """
    Present the results in the selected format.

    Delegates to the presenter held in ``self.presenter``.

    Args:
        output_format (str, optional): Passed through to the presenter as ``output_format``.
        output_file (str, optional): Passed through to the presenter as ``output_file``.
    """
    delegate = self.presenter
    delegate.present(output_format=output_format, output_file=output_file)

Diff for: src/scanoss/scanners/scanner_hfh.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def __init__(self, path: str, key: bytes, key_str: str):
6060
self.key_str = key_str
6161

6262

63-
class ScannerHFH(AbstractPresenter, ScanossBase):
63+
class ScannerHFH:
6464
"""
6565
Folder Hashing Scanner.
6666
@@ -84,13 +84,12 @@ def __init__(
8484
client (ScanossGrpc): gRPC client for communicating with the scanning service.
8585
scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
8686
"""
87-
AbstractPresenter.__init__(
88-
self,
87+
self.base = ScanossBase(
8988
debug=config.debug,
9089
trace=config.trace,
9190
quiet=config.quiet,
9291
)
93-
ScanossBase.__init__(
92+
self.presenter = ScannerHFHPresenter(
9493
self,
9594
debug=config.debug,
9695
trace=config.trace,
@@ -159,11 +158,12 @@ def _build_root_node(self, path: str) -> DirectoryNode:
159158
Returns:
160159
DirectoryNode: The root node representing the directory.
161160
"""
162-
163161
root = Path(path).resolve()
164162
root_node = DirectoryNode(str(root))
165163

166-
filtered_files = [Path(f) for f in self.file_filters.get_filtered_files_from_folder(path)]
164+
filtered_files = [
165+
f for f in root.rglob('*') if f.is_file() and not self.file_filters._should_skip_file_for_hfh(f)
166+
]
167167
# Sort the files by name to ensure the hash is the same for the same folder
168168
filtered_files.sort()
169169

@@ -281,19 +281,36 @@ def _head_calc(self, sim_hash: int) -> int:
281281
# Shift right by 4 bits and extract the lowest 8 bits
282282
return (total >> 4) & 0xFF
283283

284+
def present(self, output_format: str = None, output_file: str = None):
    """
    Present the results in the selected format.

    Delegates to the presenter held in ``self.presenter``.

    Args:
        output_format (str, optional): Passed through to the presenter as ``output_format``.
        output_file (str, optional): Passed through to the presenter as ``output_file``.
    """
    delegate = self.presenter
    delegate.present(output_format=output_format, output_file=output_file)
287+
288+
289+
class ScannerHFHPresenter(AbstractPresenter):
    """
    ScannerHFH presenter class
    Handles the presentation of the folder hashing scan results
    """

    def __init__(self, scanner: ScannerHFH, **kwargs):
        """
        Create a presenter bound to a folder hashing scanner.

        Args:
            scanner (ScannerHFH): Scanner whose ``scan_results`` attribute is rendered.
            **kwargs: Forwarded unchanged to ``AbstractPresenter.__init__``.
        """
        super().__init__(**kwargs)
        self.scanner = scanner

    def _format_json_output(self) -> str:
        """
        Format the scan output data into a JSON object

        Returns:
            str: The formatted JSON string
        """
        results = self.scanner.scan_results
        return json.dumps(results, indent=2)

    def _format_plain_output(self) -> str:
        """
        Format the scan output data into a plain text string

        Returns:
            str: Indented JSON when the scan results are a dict, otherwise their ``str()`` form
        """
        if isinstance(self.scanner.scan_results, dict):
            return json.dumps(self.scanner.scan_results, indent=2)
        return str(self.scanner.scan_results)

Diff for: src/scanoss/utils/abstract_presenter.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
AVAILABLE_OUTPUT_FORMATS = ['json', 'plain']
66

77

8-
class AbstractPresenter(ABC, ScanossBase):
8+
class AbstractPresenter(ABC):
99
"""
1010
Abstract presenter class for presenting output in a given format.
1111
Subclasses must implement the _format_json_output and _format_plain_output methods.
@@ -22,7 +22,7 @@ def __init__(
2222
"""
2323
Initialize the presenter with the given output file and format.
2424
"""
25-
super().__init__(debug=debug, trace=trace, quiet=quiet)
25+
self.base = ScanossBase(debug=debug, trace=trace, quiet=quiet)
2626
self.output_file = output_file
2727
self.output_format = output_format
2828

@@ -51,7 +51,7 @@ def _present_output(self, content: str, file_path: str = None):
5151
"""
5252
If a file path is provided, write to that file; otherwise, print the content to stdout.
5353
"""
54-
self.print_to_file_or_stdout(content, file_path)
54+
self.base.print_to_file_or_stdout(content, file_path)
5555

5656
@abstractmethod
5757
def _format_json_output(self) -> str:

0 commit comments

Comments
 (0)