
Feature: Add SARIF Support for URL Checker #91

Open · wants to merge 2 commits into master
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -2,7 +2,7 @@ name: Build and Deploy containers

on:
  # Always test on pull request
  pull_request: []
  pull_request:

  # Deploy on merge to main
  push:
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - master
  pull_request:
    branches_ignore: []

jobs:
  formatting:
3 changes: 3 additions & 0 deletions .gitignore
@@ -106,3 +106,6 @@ venv.bak/

# mypy
.mypy_cache/

# vscode
.vscode/
57 changes: 55 additions & 2 deletions README.md
@@ -61,7 +61,7 @@ $ urlchecker check --help
usage: urlchecker check [-h] [-b BRANCH] [--subfolder SUBFOLDER] [--cleanup] [--serial] [--no-check-certs]
[--force-pass] [--no-print] [--verbose] [--file-types FILE_TYPES] [--files FILES]
[--exclude-urls EXCLUDE_URLS] [--exclude-patterns EXCLUDE_PATTERNS]
[--exclude-files EXCLUDE_FILES] [--save SAVE] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
[--exclude-files EXCLUDE_FILES] [--save SAVE] [--format FORMAT] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
path

positional arguments:
@@ -89,6 +89,7 @@ options:
--exclude-files EXCLUDE_FILES
comma separated list of files and patterns to exclude (no spaces)
--save SAVE Path to a csv file to save results to.
--format FORMAT format to save results to (csv or sarif), defaults to csv.
--retry-count RETRY_COUNT
retry count upon failure (defaults to 2, one retry).
--timeout TIMEOUT timeout (seconds) to provide to the requests library (defaults to 5)
@@ -266,7 +267,9 @@ $ urlchecker check --exclude-files=README.md,_config.yml

### Save Results

If you want to save your results to file, perhaps for some kind of record or
#### Save results in CSV format

If you want to save your results to a file in CSV format, perhaps for some kind of record or
other data analysis, you can provide the `--save` argument:

```bash
@@ -313,6 +316,56 @@ https://github.com/SuperKogito/URLs-checker/issues/1,failed
https://github.com/SuperKogito/URLs-checker/issues/4,failed
```

#### Save results in SARIF format

To save results in SARIF format, you can pass `sarif` to the `--format` argument:

```bash
$ urlchecker check --save results.sarif --format sarif .
```

This produces a SARIF file with detailed information about each URL, including the exact file and line where the URL was found. Because SARIF is a standard interchange format for static-analysis results, the output can be consumed by any tool that supports it, making it easier to pinpoint and fix broken links directly in the code.

```json
{
  "version": "2.1.0",
  "runs": [
    {
      "tool": {
        "driver": {
          "name": "UrlChecker",
          "informationUri": "https://github.com/urlstechie/urlchecker-python",
          "rules": [
            {
              "id": "URL001",
              "name": "Invalid URL",
              "shortDescription": { "text": "This URL is invalid or unreachable." },
              "fullDescription": { "text": "This URL is invalid or unreachable." },
              "helpUri": "https://example.com/rule/url001"
            }
          ]
        }
      },
      "results": [
        {
          "ruleId": "URL001",
          "message": { "text": "URL https://github.com/SuperKogito/URLs-checker/README.md is invalid or unreachable." },
          "locations": [
            {
              "physicalLocation": {
                "artifactLocation": { "uri": "example_file.py" },
                "region": { "startLine": 10 }
              }
            }
          ]
        },
        ...
      ]
    }
  ]
}
```
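
Since the SARIF output is plain JSON, it is easy to post-process. Below is a minimal sketch (assuming the report shape shown above and a `results.sarif` file produced by the command above) that lists each broken URL with the file and line where it appears:

```python
import json

# Load the SARIF report written by `urlchecker check --save results.sarif --format sarif .`
with open("results.sarif") as handle:
    report = json.load(handle)

# Each run contains one result per broken URL; print its location and message.
for run in report["runs"]:
    for result in run["results"]:
        location = result["locations"][0]["physicalLocation"]
        uri = location["artifactLocation"]["uri"]
        line = location["region"]["startLine"]
        print(f"{uri}:{line} -> {result['message']['text']}")
```

Because the file follows SARIF 2.1.0, it can also be uploaded to services that ingest SARIF, such as GitHub code scanning.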

### Usage from Python

52 changes: 52 additions & 0 deletions tests/test_client_check.py
@@ -219,3 +219,55 @@ def test_client_save(save):
    if save:
        if not os.path.exists(output_csv.name):
            raise AssertionError

@pytest.mark.parametrize("save, output_format", [(True, ""), (True, "csv"), (True, "sarif")])
def test_client_save_format_csv(save, output_format):

# init config parser
config = configparser.ConfigParser()
config.read("./tests/_local_test_config.conf")

# init env variables
path = config["DEFAULT"]["git_path_test_value"]
file_types = config["DEFAULT"]["file_types_test_values"]
exclude_urls = config["DEFAULT"]["exclude_test_urls"]
exclude_patterns = config["DEFAULT"]["exclude_test_patterns"]

# Generate command
cmd = [
"urlchecker",
"check",
"--subfolder",
"test_files",
"--file-types",
file_types,
"--exclude-files",
"conf.py",
"--exclude-urls",
exclude_urls,
"--exclude_patterns",
exclude_patterns,
]

suffix = {
"csv": ".csv",
"sarif": ".sarif",
"" : ".csv"
}
# Write to file
if save:
output_file = tempfile.NamedTemporaryFile(suffix=suffix[output_format], prefix="urlchecker-")
cmd += ["--save", output_file.name]
if output_format:
cmd += ["--format", output_format]

# Add final path
cmd.append(path)

print(" ".join(cmd))
# execute script
pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if save:
if not os.path.exists(output_file.name):
raise AssertionError

74 changes: 72 additions & 2 deletions tests/test_core_check.py
@@ -2,6 +2,7 @@
import re
import sys
import pytest
import json
import configparser
from urlchecker.core.fileproc import get_file_paths
from urlchecker.main.github import clone_repo
@@ -95,9 +96,8 @@ def test_locally(local_folder_path, config_fname):
    )
    print("Done.")


@pytest.mark.parametrize("retry_count", [1, 3])
def test_check_run_save(tmp_path, retry_count):
def test_check_run_save_csv(tmp_path, retry_count):

    # init vars
    git_path = "https://github.com/urlstechie/urlchecker-test-repo"
@@ -161,3 +161,73 @@ def test_check_run_save(tmp_path, retry_count):
    for line in lines[1:]:
        url, result, filename = line.split(",")
        assert not filename.startswith(root)

@pytest.mark.parametrize("retry_count", [1, 3])
def test_check_run_save_sarif(tmp_path, retry_count):

# init vars
git_path = "https://github.com/urlstechie/urlchecker-test-repo"
file_types = [".py", ".md"]
print_all = True
exclude_urls = [
"https://superkogito.github.io/figures/fig2.html",
"https://superkogito.github.io/figures/fig4.html",
]
exclude_patterns = ["https://superkogito.github.io/tables"]
timeout = 1
force_pass = False

# clone repo
base_path = clone_repo(git_path)

# get all file paths in subfolder specified
base_path = os.path.join(base_path, "test_files")
file_paths = get_file_paths(base_path, file_types)

# check repo urls
checker = UrlChecker(print_all=print_all, save_results_format="sarif")
check_results = checker.run(
file_paths=file_paths,
exclude_urls=exclude_urls,
exclude_patterns=exclude_patterns,
retry_count=retry_count,
timeout=timeout,
)

# Test saving to file
output_file = os.path.join(str(tmp_path), "results.sarif")
assert not os.path.exists(output_file)
saved_file = checker.save_results(output_file)
assert os.path.exists(output_file)

# Read in output file
with open(saved_file, "r") as file:
sarif_output = json.load(file)

# Verify SARIF output structure
assert "version" in sarif_output
assert sarif_output["version"] == "2.1.0"
assert "runs" in sarif_output
assert len(sarif_output["runs"]) > 0
assert "tool" in sarif_output["runs"][0]
assert "driver" in sarif_output["runs"][0]["tool"]
assert "name" in sarif_output["runs"][0]["tool"]["driver"]
assert sarif_output["runs"][0]["tool"]["driver"]["name"] == "UrlChecker"
assert "results" in sarif_output["runs"][0]

# Verify at least one result entry
assert len(sarif_output["runs"][0]["results"]) > 0

# Verify the structure of a result entry
result_entry = sarif_output["runs"][0]["results"][0]
assert "ruleId" in result_entry
assert result_entry["ruleId"] == "URL001"
assert "message" in result_entry
assert "text" in result_entry["message"]
assert "locations" in result_entry
assert len(result_entry["locations"]) > 0
assert "physicalLocation" in result_entry["locations"][0]
assert "artifactLocation" in result_entry["locations"][0]["physicalLocation"]
assert "uri" in result_entry["locations"][0]["physicalLocation"]["artifactLocation"]
assert "region" in result_entry["locations"][0]["physicalLocation"]
assert "startLine" in result_entry["locations"][0]["physicalLocation"]["region"]
9 changes: 8 additions & 1 deletion urlchecker/client/__init__.py
@@ -142,9 +142,16 @@ def get_parser():

    check.add_argument(
        "--save",
        help="Path to a csv file to save results to.",
        help="Path to file to save results to.",
        default=None,
    )
    check.add_argument(
        "--format",
        help="File format to save results to.",
        default="csv",
        choices=["csv", "sarif"],
        type=str,
    )

# Timeouts

2 changes: 2 additions & 0 deletions urlchecker/client/check.py
@@ -76,6 +76,7 @@ def main(args, extra):
print(" force pass: %s" % args.force_pass)
print(" retry count: %s" % args.retry_count)
print(" save: %s" % args.save)
print(" format: %s" % args.format)
print(" timeout: %s" % args.timeout)

# Instantiate a new checker with provided arguments
@@ -86,6 +87,7 @@
        exclude_files=exclude_files,
        print_all=not args.no_print,
        serial=args.serial,
        save_results_format=args.format,
    )
    check_results = checker.run(
        exclude_urls=exclude_urls,