Skip to content

Commit 43583e9

Browse files
authored
Use filter-repo blob callback instead of filtering via git lfs ls-files (#4551)
When an LFS file is renamed (e.g. `libHarfBuzzSharp.so.0.60830.0` to `libHarfBuzzSharp.so`) the LFS blob stays the same. `git lfs ls-files --all --deleted` deduplicates by blob OID, so it only reports the new filename. The old filename disappears from the list, changing historical commits and cascading a full history rewrite. Replace the path-based LFS exclusion with inline blob content detection using git-filter-repo's `blob_callback`. Any blob starting with the LFS pointer header is stripped regardless of filename, making the filter immune to renames.
1 parent 2b4b1d0 commit 43583e9

File tree

2 files changed

+123
-46
lines changed

2 files changed

+123
-46
lines changed

engine/Tools/SboxBuild/Steps/SyncPublicRepo.cs

Lines changed: 56 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -194,15 +194,16 @@ private bool SyncToPublicRepository()
194194
return false;
195195
}
196196

197-
// Make sure we filter out lfs files that are in the history as well
198-
var allLfsPaths = GetAllPublicLfsFiles( relativeFilteredPath );
199-
if ( allLfsPaths is null )
197+
// Run git-filter-repo to filter out unwanted paths.
198+
// LFS pointer blobs are detected and stripped inline by the Python
199+
// filter (blob content inspection) so we no longer need to pass a
200+
// pre-computed LFS path list.
201+
if ( !RunFilterRepo( relativeFilteredPath ) )
200202
{
201203
return false;
202204
}
203205

204-
// Run git-filter-repo to filter out unwanted paths
205-
if ( !RunFilterRepo( relativeFilteredPath, allLfsPaths ) )
206+
if ( !ValidateFilteredRepository( relativeFilteredPath ) )
206207
{
207208
return false;
208209
}
@@ -391,7 +392,7 @@ private static bool TryUploadLfsArtifacts( string repoRoot, IReadOnlyCollection<
391392
return TryUploadArtifacts( candidates, remoteBase, artifacts, uploadedHashes, "LFS", skipUpload );
392393
}
393394

394-
private bool RunFilterRepo( string relativeRepoPath, IReadOnlyCollection<string> lfsPaths )
395+
private bool RunFilterRepo( string relativeRepoPath )
395396
{
396397
Log.Info( "Running git-filter-repo to filter paths..." );
397398

@@ -407,11 +408,7 @@ private bool RunFilterRepo( string relativeRepoPath, IReadOnlyCollection<string>
407408
IncludeGlobs = RepoFilterPathIncludeGlobs,
408409
ExcludeGlobs = RepoFilterPathExcludeGlobs,
409410
WhitelistedShaders = RepoFilterShaderWhitelistGlobs,
410-
PathRenames = RepoFilterPathRenames.ToDictionary( pair => pair.Key, pair => pair.Value, StringComparer.OrdinalIgnoreCase ),
411-
LfsPaths = lfsPaths
412-
.Select( ToForwardSlash )
413-
.Distinct( StringComparer.OrdinalIgnoreCase )
414-
.ToList()
411+
PathRenames = RepoFilterPathRenames.ToDictionary( pair => pair.Key, pair => pair.Value, StringComparer.OrdinalIgnoreCase )
415412
};
416413

417414
string configPath = null;
@@ -441,6 +438,54 @@ private bool RunFilterRepo( string relativeRepoPath, IReadOnlyCollection<string>
441438
}
442439
}
443440

441+
/// <summary>
/// File extensions that <see cref="ValidateFilteredRepository"/> reports as violations
/// when a file at HEAD of the filtered repository carries one of them.
/// Lookups are case-insensitive (<see cref="StringComparer.OrdinalIgnoreCase"/>).
/// </summary>
private static readonly HashSet<string> ForbiddenRepoExtensions = new( StringComparer.OrdinalIgnoreCase )
{
	".lib", ".exe", ".pdb", ".a", ".dll", ".dylib", ".so",
	".png", ".tga", ".jpg", ".psd", ".pdf", ".bmp", ".gif", ".exr", ".ico", ".svg", ".tif", ".tiff",
	".ttf", ".otf",
	".dmx", ".fbx", ".max",
	".wav", ".ogg", ".mp3", ".mp4", ".webm", ".avi",
	".pyd", ".ppf", ".vsix", ".vcs", ".bin", ".dat", ".jar", ".spv", ".ma", ".lxo"
};

/// <summary>
/// Lists every file at HEAD of the filtered repository and checks each path against three rules:
/// it must not live under <c>src/</c>, it must either be a rename target or match the
/// repository file filter, and it must not use an extension in <see cref="ForbiddenRepoExtensions"/>.
/// A single file may produce more than one violation.
/// </summary>
/// <param name="relativeRepoPath">Working directory passed to the <c>git ls-tree</c> invocation.</param>
/// <returns>
/// <c>true</c> when the listing succeeded and no violations were found; <c>false</c> when
/// the listing failed or at least one violation was logged.
/// </returns>
private static bool ValidateFilteredRepository( string relativeRepoPath )
{
	Log.Info( "Validating filtered repository before push..." );

	var renamedTargets = new HashSet<string>( RepoFilterPathRenames.Values, StringComparer.OrdinalIgnoreCase );
	// NOTE(review): RepoFileFilter() is defined elsewhere; it is only used here via Match( path ).HasMatches.
	var matcher = RepoFileFilter();
	var violations = new List<string>();

	var listed = Utility.RunProcess( "git", "ls-tree -r --name-only HEAD", relativeRepoPath, onDataReceived: ( _, e ) =>
	{
		if ( string.IsNullOrWhiteSpace( e.Data ) )
			return;

		var file = ToForwardSlash( e.Data.Trim() );

		if ( file.StartsWith( "src/", StringComparison.OrdinalIgnoreCase ) )
			violations.Add( $"Private source code: {file}" );

		// Rename targets are exempt from the include rules.
		if ( !renamedTargets.Contains( file ) && !matcher.Match( file ).HasMatches )
			violations.Add( $"Outside include rules: {file}" );

		var ext = Path.GetExtension( file );
		if ( !string.IsNullOrEmpty( ext ) && ForbiddenRepoExtensions.Contains( ext ) )
			violations.Add( $"Forbidden extension ({ext}): {file}" );
	} );

	// Without a successful listing nothing was inspected, so an empty violation
	// list proves nothing; fail closed instead of letting the push proceed.
	if ( !listed )
	{
		Log.Error( "Failed to list files in filtered repository for validation" );
		return false;
	}

	if ( violations.Count > 0 )
	{
		Log.Error( $"Filtered repository contains {violations.Count} violation(s):" );
		foreach ( var v in violations )
			Log.Error( $" {v}" );
		return false;
	}

	Log.Info( "Filtered repository validation passed" );
	return true;
}
488+
444489
private string PushToPublicRepository( string relativeRepoPath )
445490
{
446491
Log.Info( "Pushing filtered repository to public..." );
@@ -531,26 +576,6 @@ private static HashSet<string> GetCurrentLfsFiles( string relativeRepoPath )
531576
return trackedFiles;
532577
}
533578

534-
private static HashSet<string> GetAllPublicLfsFiles( string relativeRepoPath )
535-
{
536-
var trackedFiles = GetCurrentLfsFiles( relativeRepoPath );
537-
538-
if ( !Utility.RunProcess( "git", "lfs ls-files --all --deleted --name-only", relativeRepoPath, onDataReceived: ( _, e ) =>
539-
{
540-
if ( string.IsNullOrWhiteSpace( e.Data ) )
541-
{
542-
return;
543-
}
544-
545-
trackedFiles.Add( ToForwardSlash( e.Data.Trim() ) );
546-
} ) )
547-
{
548-
Log.Error( "Failed to list historical LFS tracked files" );
549-
return null;
550-
}
551-
552-
return trackedFiles;
553-
}
554579

555580
private void WriteDryRunOutputs( string commitHash, IEnumerable<ArtifactFileInfo> artifacts )
556581
{
@@ -848,8 +873,5 @@ private sealed class FilterConfigData
848873

849874
[JsonPropertyName( "path_renames" )]
850875
public Dictionary<string, string> PathRenames { get; init; }
851-
852-
[JsonPropertyName( "lfs_paths" )]
853-
public List<string> LfsPaths { get; init; }
854876
}
855877
}

engine/Tools/SboxBuild/Steps/SyncPublicRepoFilter.py

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,17 @@
44

55
import argparse
66
import json
7+
import posixpath
78
import sys
89
from pathlib import PurePosixPath
910
from typing import Dict, Iterable, List, Optional, Set
1011
import git_filter_repo as fr
1112

13+
_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"
14+
15+
1216
class FilenameFilter:
13-
"""Applies include/exclude rules and renames for git-filter-repo."""
17+
"""Applies include/exclude rules and renames."""
1418

1519
def __init__(self, config: Dict[str, object]) -> None:
1620
self._include_globs = tuple(_normalise_glob(p) for p in config.get("include_globs", []) or [])
@@ -23,17 +27,11 @@ def __init__(self, config: Dict[str, object]) -> None:
2327
for src, dest in renames.items()
2428
}
2529

26-
lfs_paths: Iterable[str] = config.get("lfs_paths", []) or []
27-
self._lfs_paths: Set[str] = { _normalise_path(path) for path in lfs_paths }
28-
2930
def __call__(self, filename: bytes) -> Optional[bytes]:
3031
path_text = filename.decode("utf-8", "ignore")
3132
normalised = _normalise_path(path_text)
3233
path = PurePosixPath(normalised)
3334

34-
if normalised in self._lfs_paths:
35-
return None
36-
3735
allowed = _matches_any_glob(path, self._include_globs)
3836

3937
if allowed and _matches_any_glob(path, self._exclude_globs):
@@ -52,15 +50,65 @@ def __call__(self, filename: bytes) -> Optional[bytes]:
5250
return filename
5351

5452

53+
class LfsPointerFilter:
    """Strips LFS pointer blobs and dangling symlinks from commits.

    ``blob_callback`` records which blob ids are LFS pointers and remembers the
    text of small blobs as potential symlink targets. ``strip_lfs_from_commit``
    then removes file changes that reference a pointer blob, plus any symlink
    whose target was removed from the same commit.
    """

    def __init__(self) -> None:
        # Ids of blobs whose content starts with the LFS pointer header.
        self._lfs_blob_ids: Set[int] = set()
        # Blob id -> decoded content for small blobs; consulted only for
        # changes whose mode marks them as symlinks.
        self._symlink_targets: Dict[int, str] = {}
        # Every path stripped from any commit, kept for the final summary.
        self._stripped_paths: Set[str] = set()

    def blob_callback(self, blob, _metadata) -> None:
        """Classify a blob as an LFS pointer or a possible symlink target.

        ``blob`` must expose ``data`` (bytes) and ``id``; presumably a
        git-filter-repo blob object -- confirm against the caller.
        """
        if blob.data.startswith(_LFS_POINTER_PREFIX):
            self._lfs_blob_ids.add(blob.id)
        elif len(blob.data) < 512:
            try:
                self._symlink_targets[blob.id] = blob.data.decode("utf-8").rstrip("\n")
            except UnicodeDecodeError:
                # Not valid UTF-8, so it is not treated as a symlink target.
                pass

    def _symlink_destination(self, change) -> Optional[str]:
        """Return the normalised path a symlink change points at, else ``None``."""
        if change.mode != b"120000" or change.blob_id not in self._symlink_targets:
            return None
        target = self._symlink_targets[change.blob_id]
        symlink_dir = PurePosixPath(change.filename.decode("utf-8", "replace")).parent
        return posixpath.normpath(str(symlink_dir / target))

    def strip_lfs_from_commit(self, commit) -> None:
        """Remove LFS pointer changes and symlinks left dangling by that removal.

        Replaces ``commit.file_changes`` with the surviving changes in their
        original order and adds every removed path to the running summary.
        """
        stripped_this_commit: Set[str] = set()
        remaining = []
        for change in commit.file_changes:
            if change.blob_id in self._lfs_blob_ids:
                stripped_this_commit.add(change.filename.decode("utf-8", "replace"))
            else:
                remaining.append(change)

        # Repeat until a pass strips nothing: a link can point at another link
        # that is only found to be dangling later in the list (for example
        # libfoo.so -> libfoo.so.0 -> libfoo.so.0.1.2), and a single ordered
        # pass would keep the first link of such a chain.
        progressed = True
        while progressed:
            progressed = False
            kept = []
            for change in remaining:
                if self._symlink_destination(change) in stripped_this_commit:
                    stripped_this_commit.add(change.filename.decode("utf-8", "replace"))
                    progressed = True
                else:
                    kept.append(change)
            remaining = kept

        self._stripped_paths.update(stripped_this_commit)
        commit.file_changes = remaining

    def log_summary(self) -> None:
        """Print how many pointer blobs were seen and which paths were stripped."""
        print(f"[LfsPointerFilter] Detected {len(self._lfs_blob_ids)} LFS pointer blob(s)")
        print(f"[LfsPointerFilter] Stripped {len(self._stripped_paths)} unique path(s) from history")
        if self._stripped_paths:
            for path in sorted(self._stripped_paths):
                print(f" - {path}")
101+
102+
55103
class BaselineCommitCallback:
56-
"""Rewrites the root commit metadata for the public history."""
104+
"""Rewrites the root commit metadata."""
57105

58106
_base_message = (
59107
"Open source release\n\n"
60108
"This commit imports the C# engine code and game files, excluding C++ source code."
61109
)
62110

63-
def __call__(self, commit, metadata) -> None: # pylint: disable=unused-argument
111+
def __call__(self, commit, metadata) -> None:
64112
if commit.parents:
65113
return
66114

@@ -88,26 +136,33 @@ def _matches_any_glob(path: PurePosixPath, patterns: Iterable[str]) -> bool:
88136

89137

90138
def main(argv: Optional[List[str]] = None) -> int:
    """Load the JSON config named by ``--config`` and run git-filter-repo with it.

    Wires three callbacks into the repo filter: LFS pointer detection on blobs,
    the filename include/exclude/rename filter, and a commit callback that first
    strips LFS changes and then applies the baseline commit rewrite. Prints the
    LFS filter summary after the run and returns 0.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--config", required=True)
    parsed = cli.parse_args(argv)

    with open(parsed.config, "r", encoding="utf-8") as handle:
        settings = json.load(handle)

    path_filter = FilenameFilter(settings)
    root_commit_rewriter = BaselineCommitCallback()
    pointer_filter = LfsPointerFilter()

    def commit_callback(commit, metadata):
        # Strip LFS changes first, then let the baseline rewrite see the commit.
        pointer_filter.strip_lfs_from_commit(commit)
        root_commit_rewriter(commit, metadata)

    filter_options = fr.FilteringOptions.parse_args([], error_on_empty=False)
    filter_options.force = True

    fr.RepoFilter(
        filter_options,
        blob_callback=pointer_filter.blob_callback,
        filename_callback=path_filter,
        commit_callback=commit_callback,
    ).run()

    pointer_filter.log_summary()
    return 0
112167

113168
if __name__ == "__main__":

0 commit comments

Comments
 (0)