diff --git a/Cargo.toml b/Cargo.toml index 606ba8e16..c255f2ea3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,6 +65,7 @@ regex = "1.10.6" serde_json = "1.0.125" etcetera = "0.8.0" table_formatter = "0.6.1" +gix-attributes = "0.28.1" [dependencies.env_logger] optional = true diff --git a/README.md b/README.md index 018749a8f..c0f541bf9 100644 --- a/README.md +++ b/README.md @@ -251,11 +251,13 @@ FLAGS: --hidden Count hidden files. -l, --languages Prints out supported languages and their extensions. --no-ignore Don't respect ignore files (.gitignore, .ignore, etc.). This implies --no-ignore-parent, - --no-ignore-dot, and --no-ignore-vcs. + --no-ignore-dot, --no-ignore-vcs, and --no-ignore-linguist. --no-ignore-dot Don't respect .ignore and .tokeignore files, including those in parent directories. --no-ignore-parent Don't respect ignore files (.gitignore, .ignore, etc.) in parent directories. --no-ignore-vcs Don't respect VCS ignore files (.gitignore, .hgignore, etc.), including those in parent directories. + --no-ignore-linguist Don't respect linguist-vendored, linguist-generated, and linguist-documentation statements + in .gitattributes files. -V, --version Prints version information -v, --verbose Set log output level: 1: to show unknown file extensions, diff --git a/fuzz/fuzz_targets/parse_from_slice.rs b/fuzz/fuzz_targets/parse_from_slice.rs index c6d43745c..adb5de495 100644 --- a/fuzz/fuzz_targets/parse_from_slice.rs +++ b/fuzz/fuzz_targets/parse_from_slice.rs @@ -23,6 +23,7 @@ pub fn parse_from_slice(input: FuzzInput, check_total: bool) { no_ignore_parent: None, no_ignore_dot: None, no_ignore_vcs: None, + no_ignore_linguist: None, sort: None, types: None, for_each_fn: None, diff --git a/src/cli.rs b/src/cli.rs index 5243cb032..e7620fc10 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -44,6 +44,7 @@ pub struct Cli { pub no_ignore_parent: bool, pub no_ignore_dot: bool, pub no_ignore_vcs: bool, + pub no_ignore_linguist: bool, pub output: Option, pub streaming: Option, pub print_languages: bool, @@ -154,6 +155,15 @@ impl Cli { those in parent directories.\ ", )) + .arg(Arg::new("no_ignore_linguist") + .long("no-ignore-linguist") + .action(ArgAction::SetTrue) + .help( + "\ + Don't respect linguist-vendored, linguist-generated, and linguist-documentation \ + statements in .gitattributes files, including those in parent directories \ + " + )) .arg( Arg::new("output") .long("output") @@ -242,6 +252,7 @@ impl Cli { let no_ignore_parent = matches.get_flag("no_ignore_parent"); let no_ignore_dot = matches.get_flag("no_ignore_dot"); let no_ignore_vcs = matches.get_flag("no_ignore_vcs"); + let no_ignore_linguist = matches.get_flag("no_ignore_linguist"); let print_languages = matches.get_flag("languages"); let verbose = matches.get_count("verbose") as u64; let compact = matches.get_flag("compact"); @@ -304,6 +315,7 @@ impl Cli { no_ignore_parent, no_ignore_dot, no_ignore_vcs, + no_ignore_linguist, output, streaming, print_languages, @@ -408,6 +420,7 @@ impl Cli { /// * `no_ignore_parent` /// * `no_ignore_dot` /// * `no_ignore_vcs` + /// * `no_ignore_linguist` /// * `types` pub fn override_config(&mut self, mut config: Config) -> Config { config.hidden = if self.hidden { @@ -440,6 +453,12 @@ impl Cli { config.no_ignore_vcs }; + config.no_ignore_linguist = if self.no_ignore_linguist { + Some(true) + } else { + config.no_ignore_linguist + }; + config.for_each_fn = match self.streaming { Some(Streaming::Json) => Some(|l: LanguageType, e| { println!("{}", serde_json::json!({"language": l.name(), "stats": e})); diff --git a/src/config.rs b/src/config.rs index 82dcd8a0c..25a14006e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -38,6 +38,9 @@ pub struct Config { /// Don't respect VCS ignore files (.gitignore, .hgignore, etc.), including those in /// parent directories. *Default:* `false`. pub no_ignore_vcs: Option, + /// Don't respect linguist-vendored, linguist-generated, and linguist-documentation statements, + /// including those in parent directories. *Default:* `false` + pub no_ignore_linguist: Option, /// Whether to treat doc strings in languages as comments. *Default:* /// `false`. pub treat_doc_strings_as_comments: Option, @@ -135,6 +138,9 @@ impl Config { no_ignore_vcs: current_dir .no_ignore_vcs .or(home_dir.no_ignore_vcs.or(conf_dir.no_ignore_vcs)), + no_ignore_linguist: current_dir + .no_ignore_linguist + .or(home_dir.no_ignore_linguist.or(conf_dir.no_ignore_linguist)), } } } diff --git a/src/utils/fs.rs b/src/utils/fs.rs index 966115f5f..4357c1f29 100644 --- a/src/utils/fs.rs +++ b/src/utils/fs.rs @@ -1,4 +1,9 @@ -use std::{collections::BTreeMap, path::Path}; +use std::{ + collections::BTreeMap, + path::{Path, PathBuf}, +}; + +use gix_attributes::{self, parse::Kind}; use ignore::{overrides::OverrideBuilder, DirEntry, WalkBuilder, WalkState::Continue}; @@ -10,6 +15,12 @@ use crate::{ }; const IGNORE_FILE: &str = ".tokeignore"; +const GITATTRIBUTES: &str = ".gitattributes"; +const LINGUIST_IGNORES: &[&str] = &[ + "linguist-vendored", + "linguist-generated", + "linguist-documentation", +]; pub fn get_all_files>( paths: &[A], @@ -20,26 +31,36 @@ pub fn get_all_files>( let languages = parking_lot::Mutex::new(languages); let (tx, rx) = crossbeam_channel::unbounded(); - let mut paths = paths.iter(); - let mut walker = WalkBuilder::new(paths.next().unwrap()); + let mut paths_iter = paths.iter(); + let mut walker = WalkBuilder::new(paths_iter.next().unwrap()); - for path in paths { + for path in paths_iter { walker.add(path); } - if !ignored_directories.is_empty() { - let mut overrides = OverrideBuilder::new("."); + let ignore = config.no_ignore.map(|b| !b).unwrap_or(true); + let ignore_dot = ignore && config.no_ignore_dot.map(|b| !b).unwrap_or(true); + let ignore_parent = ignore && config.no_ignore_parent.map(|b| !b).unwrap_or(true); + let ignore_vcs = ignore && config.no_ignore_vcs.map(|b| !b).unwrap_or(true); + let ignore_linguist = ignore && config.no_ignore_linguist.map(|b| !b).unwrap_or(true); + let mut overrides = OverrideBuilder::new("."); + if !ignored_directories.is_empty() { for ignored in ignored_directories { - rs_error!(overrides.add(&format!("!{}", ignored))); + rs_error!(overrides.add(&flip_rule(ignored))); } - - walker.overrides(overrides.build().expect("Excludes provided were invalid")); } - - let ignore = config.no_ignore.map(|b| !b).unwrap_or(true); - let ignore_dot = ignore && config.no_ignore_dot.map(|b| !b).unwrap_or(true); - let ignore_vcs = ignore && config.no_ignore_vcs.map(|b| !b).unwrap_or(true); + if ignore_linguist { + get_linguist_overrides(&mut overrides, paths, ignore_parent); + } + match overrides.build() { + Ok(overrides) => { + walker.overrides(overrides); + } + Err(err) => { + error!("Error reading overrides: {err}"); + } + }; // Custom ignore files always work even if the `ignore` option is false, // so we only add if that option is not present. @@ -53,7 +74,7 @@ pub fn get_all_files>( .git_ignore(ignore_vcs) .hidden(config.hidden.map(|b| !b).unwrap_or(true)) .ignore(ignore_dot) - .parents(ignore && config.no_ignore_parent.map(|b| !b).unwrap_or(true)); + .parents(ignore_parent); walker.build_parallel().run(move || { let tx = tx.clone(); @@ -116,6 +137,41 @@ pub fn get_all_files>( } } +pub(crate) fn get_linguist_overrides>( + overrides: &mut OverrideBuilder, + paths: &[A], + ignore_parent: bool, +) { + let gitattribute_files: Vec = paths + .iter() + .flat_map(|path| { + if ignore_parent { + vec![path.as_ref()] + } else { + path.as_ref().ancestors().collect::>() + } + }) + .map(|dir| dir.join(GITATTRIBUTES)) + .filter(|candidate| candidate.exists()) + .collect(); + + for file in gitattribute_files { + let content = rs_error!(std::fs::read(&file)); + for assignment in gix_attributes::parse(&content) { + let (kind, attributes, __line_number) = rs_error!(assignment); + if attributes.filter_map(Result::ok).any(|attr| { + LINGUIST_IGNORES + .iter() + .any(|lin| *lin == attr.name.as_str()) + }) { + if let Kind::Pattern(pattern) = kind { + rs_error!(overrides.add(&flip_rule(rs_error!(str::from_utf8(&pattern.text))))); + } + } + } + } +} + pub(crate) fn get_extension(path: &Path) -> Option { path.extension().map(|e| e.to_string_lossy().to_lowercase()) } @@ -124,13 +180,19 @@ pub(crate) fn get_filename(path: &Path) -> Option { path.file_name().map(|e| e.to_string_lossy().to_lowercase()) } +pub(crate) fn flip_rule(rule: &str) -> String { + rule.strip_prefix('!') + .map(|x| x.to_owned()) + .unwrap_or(format!("!{}", rule)) +} + #[cfg(test)] mod tests { use std::fs; use tempfile::TempDir; - use super::IGNORE_FILE; + use super::{GITATTRIBUTES, IGNORE_FILE}; use crate::{ config::Config, language::{languages::Languages, LanguageType}, @@ -452,6 +514,41 @@ mod tests { assert!(languages.get(LANGUAGE).is_some()); } + #[test] + fn no_ignore_linguist() { + let dir = TempDir::new().expect("Couldn't create temp dir."); + let mut config = Config::default(); + let mut languages = Languages::new(); + + fs::write( + dir.path().join(GITATTRIBUTES), + format!("{} linguist-generated", IGNORE_PATTERN), + ) + .unwrap(); + fs::write(dir.path().join(FILE_NAME), FILE_CONTENTS).unwrap(); + + super::get_all_files( + &[dir.path().to_str().unwrap()], + &[], + &mut languages, + &config, + ); + dbg!(config.no_ignore_linguist); + + assert!(languages.get(LANGUAGE).is_none()); + + config.no_ignore_linguist = Some(true); + + super::get_all_files( + &[dir.path().to_str().unwrap()], + &[], + &mut languages, + &config, + ); + + assert!(languages.get(LANGUAGE).is_some()); + } + #[test] fn custom_ignore() { let dir = TempDir::new().expect("Couldn't create temp dir.");