Skip to content

Commit b8ece1a

Browse files
authored
Add large file logging support (#116)
1 parent 54ad36b commit b8ece1a

5 files changed

Lines changed: 162 additions & 58 deletions

File tree

CHANGES.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# [1.17.0](https://github.com/ComplianceAsCode/auditree-framework/releases/tag/v1.17.0)
2+
3+
- [ADDED] Locker get_large_files method added to return large files in the locker.
4+
- [ADDED] Logging of large files added to remote push operation.
5+
16
# [1.16.0](https://github.com/ComplianceAsCode/auditree-framework/releases/tag/v1.16.0)
27

38
- [ADDED] Locker get_empty_evidences method added to return all empty evidence paths.

compliance/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414
# limitations under the License.
1515
"""Compliance automation package."""
1616

17-
__version__ = '1.16.0'
17+
__version__ = '1.17.0'

compliance/locker.py

Lines changed: 73 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@
4545
AE_DEFAULT = 30 * DAY
4646
READMES = ['README.md', 'readme.md', 'Readme.md']
4747
AE_EXEMPT = [INDEX_FILE] + READMES
48+
NOT_EVIDENCE = AE_EXEMPT
49+
KB = 1000
50+
MB = KB * 1000
51+
LF_DEFAULT = 50 * MB
4852

4953

5054
class Locker(object):
@@ -635,6 +639,7 @@ def push(self):
635639
)
636640
remote.fetch()
637641
remote.pull(rebase=True)
642+
self._log_large_files()
638643
self.logger.info(
639644
f'Pushing local locker to remote repo {self.repo_url}...'
640645
)
@@ -704,38 +709,43 @@ def get_abandoned_evidences(self, threshold=None):
704709
705710
:returns: a set of abandoned evidence file relative paths.
706711
"""
707-
abandoned_evidence = []
708-
tree = self.repo.head.commit.tree
709-
for f in tree.traverse():
710-
if (f.type != 'blob' or f.path.startswith('notifications/')
711-
or f.path == 'check_results.json'
712-
or f.path.split('/').pop() in AE_EXEMPT):
713-
continue
712+
abandoned_evidence = set()
713+
for f in self._get_git_files('evidence'):
714714
metadata = self.get_evidence_metadata(f.path, dt.utcnow())
715715
if self._evidence_abandoned(metadata, threshold):
716-
abandoned_evidence.append(f.path)
717-
return set(abandoned_evidence)
716+
abandoned_evidence.add(f.path)
717+
return abandoned_evidence
718718

719719
def get_empty_evidences(self):
720720
"""
721721
Provide a list of evidence paths to empty evidence files.
722722
723-
Evidence content is deemed empty based on an evidence object's
723+
Evidence content is considered empty based on an evidence object's
724724
is_empty property. This information is stored in evidence metadata.
725725
726726
:returns: a list of empty evidence file relative paths.
727727
"""
728728
empty_evidence = []
729-
tree = self.repo.head.commit.tree
730-
for idx_file in [f for f in tree.traverse() if is_index_file(f.path)]:
731-
metadata = json.loads(idx_file.data_stream.read())
732-
for ev_name, ev_meta in metadata.items():
729+
for f in self._get_git_files('index'):
730+
for ev_name, ev_meta in json.loads(f.data_stream.read()).items():
733731
if ev_meta.get('empty', False):
734732
empty_evidence.append(
735-
str(PurePath(PurePath(idx_file.path).parent, ev_name))
733+
str(PurePath(f.path).with_name(ev_name))
736734
)
737735
return empty_evidence
738736

737+
def get_large_files(self, size=LF_DEFAULT):
738+
"""
739+
Provide a dictionary of "large" evidence locker files.
740+
741+
A "large" file is one whose size is > the ``size`` argument provided.
742+
743+
:param int size: file size threshold.
744+
745+
:returns: a dictionary of file paths and sizes of "large" files.
746+
"""
747+
return {f.path: f.size for f in self._get_git_files() if f.size > size}
748+
739749
def delete_repo_locally(self):
740750
"""Remove the local git repository."""
741751
try:
@@ -935,7 +945,52 @@ def _validate_evidence(self, evidence, ignore_ttl):
935945
if not ignore_ttl and ttl_expired:
936946
raise StaleEvidenceError(f'Evidence {evidence.path} is stale')
937947

948+
def _get_git_files(self, file_type='all'):
949+
iz = {
950+
'all': lambda g: g.type == 'blob',
951+
'evidence': is_evidence_file,
952+
'index': is_index_file
953+
}
954+
return filter(iz[file_type], self.repo.head.commit.tree.traverse())
955+
956+
def _log_large_files(self):
957+
large_files = self.get_large_files(
958+
get_config().get('locker.large_file_threshold', LF_DEFAULT)
959+
)
960+
if large_files:
961+
msg = ['LARGE FILES (Hosting service may reject due to size):\n']
962+
for fpath, size in large_files.items():
963+
formatted_size = f'{size/MB:.1f} MB'
964+
if formatted_size == '0.0 MB':
965+
formatted_size = f'{str(size)} Bytes'
966+
msg.append(f' {fpath} is {formatted_size}')
967+
self.logger.info('\n'.join(msg) + '\n')
968+
938969

939-
def is_index_file(path):
940-
"""Confirm whether the supplied path is to an index file."""
941-
return os.path.basename(path) == INDEX_FILE
970+
def is_evidence_file(git_obj):
971+
"""
972+
Confirm whether the supplied git object is an evidence file.
973+
974+
:param git_obj: A GitPython object
975+
976+
:returns: True or False (Object is or isn't an evidence file)
977+
"""
978+
return (
979+
git_obj.type == 'blob'
980+
and not git_obj.path.startswith('notifications/')
981+
and git_obj.path != 'check_results.json'
982+
and PurePath(git_obj.path).name not in NOT_EVIDENCE
983+
)
984+
985+
986+
def is_index_file(obj):
987+
"""
988+
Confirm whether the supplied object is, or points to, an index file.
989+
990+
:param obj: Either a GitPython object or a relative file path as a string
991+
992+
:returns: True or False (Object is or isn't an index file)
993+
"""
994+
if isinstance(obj, str):
995+
return PurePath(obj).name == INDEX_FILE
996+
return obj.type == 'blob' and PurePath(obj.path).name == INDEX_FILE

doc-source/design-principles.rst

Lines changed: 64 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,16 @@ evidence (see :py:mod:`compliance.evidence`):
4747
See :ref:`fetchers` section for conventions and expectations with
4848
respect to modifying RawEvidence.
4949

50-
All evidence has a ``ttl`` field (Time To Live) which defines for how
51-
long an evidence should be considered valid. For instance, healthcheck
52-
data is only valid during 1 day since new input is generated everyday. For
53-
this reason, any check trying to use an evidence with an expired ``ttl``
54-
must error.
50+
All evidence has a settable ``ttl`` (Time To Live) property that defines how
51+
long an evidence should be considered valid. For instance, if new data is
52+
generated on a daily basis then evidence gathered for that data should only be
53+
valid for 1 day. For this reason, any check trying to use evidence with an
54+
expired ``ttl`` will error.
55+
56+
All evidence has an ``is_empty`` property that defines an evidence's empty
57+
state. This provides value when monitoring evidence content for completeness.
58+
The property can be overridden to define "empty" for any given evidence. By default, evidence is considered empty if it has no content, is all whitespace,
59+
or if it is JSON and is an empty dictionary or list (``{}``, ``[]``).
5560

5661

5762
Evidence Locker
@@ -89,22 +94,22 @@ for:
8994

9095
* Validating the ``ttl`` for a given evidence. An optional evidence
9196
``ttl`` tolerance value can be configured to be applied during
92-
fetcher execution. Check execution is not affected by this optional
93-
tolerance value because checks should only interact with evidence that
94-
is fresh (not stale). This value (in seconds) tells fetchers to
97+
fetcher execution. This value (in seconds) tells fetchers to
9598
retrieve evidence that is nearly but not yet stale. If no value is
9699
supplied then fetchers will only retrieve new evidence after ``ttl``
97100
has expired. You can set the optional ``ttl_tolerance`` value in
98-
your configuration JSON file like so:
101+
your configuration JSON file like so::
99102

100-
.. code-block:: json
103+
{
104+
"locker": {
105+
"repo_url": "https://github.com/my-org/my-evidence-repo",
106+
"ttl_tolerance": 3600
107+
}
108+
}
101109

102-
{
103-
"locker": {
104-
"repo_url": "https://github.com/my-org/my-evidence-repo",
105-
"ttl_tolerance": 3600
106-
}
107-
}
110+
Check execution is not affected by this optional
111+
tolerance value because checks should only interact with evidence that
112+
is fresh (not stale).
108113

109114
* It's generally a good idea to regularly "archive" an evidence locker in
110115
favor of a fresh one. A yearly locker archive/refresh is a good guideline
@@ -114,38 +119,58 @@ for:
114119
locker is possible by using the ``prev_repo_url`` option. With that
115120
option set, a check that is unable to find historical evidence in the
116121
current evidence locker will be able to download the previous locker and
117-
look for the historical evidence there. This will continue to do this until
118-
the new locker is primed with enough historical evidence to support all
119-
checks. Setting the option in your configuration JSON file would look
120-
similar to:
122+
look for the historical evidence there. Setting the option in your configuration JSON file would look
123+
similar to::
121124

122-
.. code-block:: json
125+
{
126+
"locker": {
127+
"repo_url": "https://github.com/my-org/my-evidence-repo",
128+
"prev_repo_url": "https://github.com/my-org/my-evidence-repo-old"
129+
}
130+
}
123131

124-
{
125-
"locker": {
126-
"repo_url": "https://github.com/my-org/my-evidence-repo",
127-
"prev_repo_url": "https://github.com/my-org/my-evidence-repo-old"
128-
}
129-
}
132+
The previous locker will no longer be downloaded once the new locker is
133+
primed with enough historical evidence to support all checks.
130134

131135
* A locker can grow large, causing CI/CD jobs to run longer than desired
132136
due to locker download time. So in addition to a sound locker archiving
133137
strategy, it is also possible to configure your locker to only download
134-
recent commits by using the ``shallow_days`` option. When ``shallow_days``
135-
is supplied, only commits since the current date minus the number of days set
136-
as ``shallow_days`` are included in the locker download. The option applies
137-
to both the locker and the previous locker (if applicable). Setting the
138+
recent commits by using the ``shallow_days`` option. Setting the
138139
option in your configuration JSON file would look similar to::
139140

140-
.. code-block:: json
141+
{
142+
"locker": {
143+
"repo_url": "https://github.com/my-org/my-evidence-repo",
144+
"prev_repo_url": "https://github.com/my-org/my-evidence-repo-old",
145+
"shallow_days": 10
146+
}
147+
}
148+
149+
When ``shallow_days`` is supplied, only commits since the current date minus
150+
the number of days set as ``shallow_days`` are included in the locker
151+
download. The option applies to both the locker and the previous locker (if
152+
applicable).
153+
154+
* Remote hosting services (GitHub, GitLab, Bitbucket) typically have file size
155+
limitations that can vary from service instance to service instance.
156+
Exceeding a maximum file size will in turn cause the service managing your
157+
evidence locker to reject a remote locker Git push request. Unfortunately,
158+
rejection notices from a service aren't always the most descriptive, so it
159+
often isn't clear why your push request was rejected. To that end, prior to
160+
a remote push, the framework will log a list of "large" files. The
161+
large file size threshold is configurable and can be set by using the
162+
``large_file_threshold`` option. The value is in bytes and defaults to
163+
50 MB. Setting the option in your configuration JSON file would look similar
164+
to::
165+
166+
{
167+
"locker": {
168+
"repo_url": "https://github.com/my-org/my-evidence-repo",
169+
"large_file_threshold": 50000000
170+
}
171+
}
141172

142-
{
143-
"locker": {
144-
"repo_url": "https://github.com/my-org/my-evidence-repo",
145-
"prev_repo_url": "https://github.com/my-org/my-evidence-repo-old",
146-
"shallow_days": 10
147-
}
148-
}
173+
This should hopefully add some detail to a remote Git push rejection.
149174

150175

151176
.. _fetchers:

test/t_compliance/t_locker/test_locker.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,25 @@ def test_empty_evidence(self):
281281
]
282282
)
283283

284+
def test_large_files(self):
285+
"""Test paths to large files are returned."""
286+
with Locker(name=REPO_DIR) as locker:
287+
large = RawEvidence(
288+
'large.txt', 'test_category', DAY, 'Large evidence'
289+
)
290+
large.set_content('X' * 10000)
291+
locker.add_evidence(large)
292+
small = RawEvidence(
293+
'small.txt', 'test_category', DAY, 'Small evidence'
294+
)
295+
small.set_content('X' * 10)
296+
locker.add_evidence(small)
297+
locker.checkin()
298+
self.assertEqual(
299+
locker.get_large_files(9999),
300+
{'raw/test_category/large.txt': 10000}
301+
)
302+
284303
def test_add_partitioned_evidence(self):
285304
"""Test that partitioned evidence is added to locker as expected."""
286305
with Locker(name=REPO_DIR) as locker:

0 commit comments

Comments
 (0)