diff --git a/Cargo.lock b/Cargo.lock
index bb69d9549..502799b8a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -302,6 +302,27 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "csv"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "dashmap"
 version = "6.0.1"
@@ -1354,13 +1375,14 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokei"
-version = "13.0.0-alpha.9"
+version = "13.0.0"
 dependencies = [
  "aho-corasick",
  "arbitrary",
  "clap",
  "colored",
  "crossbeam-channel",
+ "csv",
  "dashmap",
  "encoding_rs_io",
  "env_logger",
diff --git a/Cargo.toml b/Cargo.toml
index 36efc3728..f04166bb6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,10 +23,11 @@ rust-version = "1.71"
 edition = "2021"
 
 [features]
-all = ["cbor", "yaml"]
+all = ["cbor", "yaml", "csv"]
 cbor = ["dep:hex", "dep:serde_cbor"]
 default = []
 yaml = ["dep:serde_yaml"]
+csv = ["dep:csv"]
 
 [profile.release]
 lto = "thin"
@@ -77,6 +78,10 @@ version = "0.11.2"
 optional = true
 version = "0.9.34"
 
+[dependencies.csv]
+optional = true
+version = "1.3.1"
+
 [dev-dependencies]
 proptest = "1.5.0"
 strum = "0.26.3"
diff --git a/README.md b/README.md
index 018749a8f..aeb4a299f 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ Tokei is a program that displays statistics about your code. Tokei will show the
 
 - Tokei has huge range of languages, supporting over **150** languages, and their
   various extensions.
 
-- Tokei can output in multiple formats(**CBOR**, **JSON**, **YAML**)
+- Tokei can output in multiple formats(**CBOR**, **JSON**, **YAML**, **CSV**)
   allowing Tokei's output to be easily stored, and reused. These can also be
   reused in tokei combining a previous run's statistics with another set.
@@ -219,12 +219,16 @@ tokei with the features flag.
 
   YAML:
   cargo install tokei --features yaml
+
+  CSV:
+  cargo install tokei --features csv
 ```
 
 **Currently supported formats**
 - JSON `--output json`
 - YAML `--output yaml`
 - CBOR `--output cbor`
+- CSV `--output csv`
 
 ```shell
 $ tokei ./foo --output json
@@ -268,7 +268,7 @@ OPTIONS:
     -i, --input <file_input>          Gives statistics from a previous tokei run. Can be given a file path, or "stdin" to
                                       read from stdin.
     -o, --output <output>             Outputs Tokei in a specific format. Compile with additional features for more format
-                                      support. [possible values: cbor, json, yaml]
+                                      support. [possible values: cbor, json, yaml, csv]
     -s, --sort <sort>                 Sort languages based on column [possible values: files, lines, blanks, code,
                                       comments]
     -t, --type <types>                Filters output by language type, separated by a comma. i.e. -t=Rust,Markdown
diff --git a/src/cli.rs b/src/cli.rs
index 5243cb032..a3422de08 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -319,7 +319,7 @@ impl Cli {
         cli
     }
 
-    pub fn file_input(&self) -> Option<&str> {
+    pub fn file_input(&self) -> Option<String> {
         self.matches.get_one("file_input").cloned()
     }
 
diff --git a/src/input.rs b/src/input.rs
index 1460a9696..544882437 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -162,6 +162,10 @@ supported_formats!(
     (yaml, "yaml", Yaml [serde_yaml]) =>
         serde_yaml::from_str,
         serde_yaml::to_string,
+
+    (csv, "csv", Csv [csv]) =>
+        serialize_csv::from_str,
+        serialize_csv::to_string,
 );
 
 pub fn add_input(input: &str, languages: &mut Languages) -> bool {
@@ -204,6 +208,271 @@ fn convert_input(contents: &str) -> Option<Output> {
     self::Format::parse(contents)
 }
 
+#[cfg(feature = "csv")]
+mod serialize_csv {
+    //! CSV serialization
+    //!
+    //! Linearizes hierarchical blob structures into flat CSV format.
+    //!
+    //! Files contain Reports with CodeStats that have nested blobs:
+    //!
+    //! ```
+    //! README.md (Markdown)
+    //! └─ Rust (code block)
+    //!    └─ Markdown (comment nested within Rust)
+    //! ```
+    //!
+    //! are flattened using `nested` column:
+    //!
+    //! | File      | Language | Nested          | Lines | Code | Comments | Blanks |
+    //! |-----------|----------|-----------------|-------|------|----------|--------|
+    //! | README.md | Markdown | ""              | 100   | 80   | 15       | 5      |
+    //! | README.md | Markdown | "Rust"          | 50    | 45   | 3        | 2      |
+    //! | README.md | Markdown | "Rust,Markdown" | 20    | 18   | 1        | 1      |
+    //!
+    //! using depth-first traversal of the CodeStats blob tree.
+
+    use std::collections::hash_map::Entry;
+    use std::collections::BTreeMap;
+    use std::collections::HashMap;
+    use std::error::Error;
+    use std::path::PathBuf;
+
+    use super::LanguageMap;
+    use super::Output;
+    use serde::Deserialize;
+    use serde::Deserializer;
+    use serde::Serialize;
+    use tokei::CodeStats;
+    use tokei::Language;
+    use tokei::LanguageType;
+    use tokei::Report;
+
+    /// CSV record for language statistics.
+    ///
+    /// Represents either:
+    /// - Primary file stats (nested = empty)
+    /// - Nested blob stats (nested = path to blob)
+    #[derive(Debug, Serialize, Deserialize)]
+    #[serde(rename_all = "PascalCase")]
+    struct Record {
+        file: PathBuf,
+        /// File's primary language (constant for all blobs)
+        language: LanguageType,
+        /// Comma-separated nested path (e.g., "Rust,Markdown")
+        #[serde(
+            serialize_with = "Record::serialize_nested_langs",
+            deserialize_with = "Record::deserialize_nested_langs"
+        )]
+        nested: Vec<LanguageType>,
+        lines: usize,
+        code: usize,
+        comments: usize,
+        blanks: usize,
+        /// Accuracy flag
+        inaccurate: bool,
+    }
+
+    impl Record {
+        fn new(
+            file: PathBuf,
+            language: LanguageType,
+            nested: Vec<LanguageType>,
+            inaccurate: bool,
+            stats: &CodeStats,
+        ) -> Self {
+            Self {
+                file,
+                language,
+                nested,
+                lines: stats.lines(),
+                code: stats.code,
+                comments: stats.comments,
+                blanks: stats.blanks,
+                inaccurate,
+            }
+        }
+
+        fn serialize_nested_langs<S>(
+            nested: &[LanguageType],
+            serializer: S,
+        ) -> Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            if nested.is_empty() {
+                return serializer.serialize_str("");
+            }
+            let s = nested
+                .iter()
+                .map(LanguageType::to_string)
+                .collect::<Vec<String>>()
+                .join(",");
+            serializer.serialize_str(&s)
+        }
+
+        fn deserialize_nested_langs<'de, D>(deserializer: D) -> Result<Vec<LanguageType>, D::Error>
+        where
+            D: Deserializer<'de>,
+        {
+            let s = String::deserialize(deserializer)?;
+            let s = s.trim();
+
+            if s.is_empty() {
+                return Ok(Vec::new());
+            }
+
+            s.split(',')
+                .map(|x| {
+                    LanguageType::from_name(x.trim())
+                        .ok_or_else(|| serde::de::Error::custom(format!("Unknown language {x}")))
+                })
+                .collect()
+        }
+
+        fn to_code_stats(&self) -> CodeStats {
+            let mut cs = CodeStats::new();
+
+            cs.blanks = self.blanks;
+            cs.code = self.code;
+            cs.comments = self.comments;
+
+            cs
+        }
+
+        fn to_report(&self) -> Report {
+            let mut report = Report::new(self.file.clone());
+            report.stats += self.to_code_stats();
+            report
+        }
+    }
+
+    /// Recursively serializes blob tree to CSV records.
+    ///
+    /// Depth-first traversal maintaining current path in `nested` vector.
+    fn serialize_blobs(
+        csv: &mut csv::Writer<Vec<u8>>,
+        file: &PathBuf,
+        primary: LanguageType,
+        nested: &mut Vec<LanguageType>,
+        blobs: &BTreeMap<LanguageType, CodeStats>,
+    ) -> Result<(), Box<dyn Error>> {
+        for (lang_type, stats) in blobs {
+            nested.push(*lang_type);
+
+            csv.serialize(Record::new(
+                file.clone(),
+                primary,
+                nested.clone(),
+                false,
+                stats,
+            ))?;
+
+            serialize_blobs(csv, file, primary, nested, &stats.blobs)?;
+            nested.pop();
+        }
+
+        Ok(())
+    }
+
+    pub(super) fn to_string(output: &Output) -> Result<String, Box<dyn Error>> {
+        let mut csv = csv::Writer::from_writer(vec![]);
+
+        for (lang_type, lang) in &output.languages {
+            for report in &lang.reports {
+                csv.serialize(Record::new(
+                    report.name.clone(),
+                    *lang_type,
+                    Vec::new(),
+                    lang.inaccurate,
+                    &report.stats,
+                ))?;
+                let mut nested = Vec::new();
+                serialize_blobs(
+                    &mut csv,
+                    &report.name,
+                    *lang_type,
+                    &mut nested,
+                    &report.stats.blobs,
+                )?;
+            }
+        }
+
+        Ok(String::from_utf8(csv.into_inner()?)?)
+    }
+
+    /// Parses CSV string into Output structure
+    ///
+    /// Reconstructs hierarchical blob structure from linearized CSV.
+    ///
+    /// Steps:
+    /// 1. Parse CSV records, group by file
+    /// 2. Use `nested` field as navigation path to rebuild blob tree
+    /// 3. Sort by original order, aggregate statistics
+    ///
+    /// Example (simplified):
+    ///
+    /// | File      | Language | Nested          | Lines |
+    /// |-----------|----------|-----------------|-------|
+    /// | README.md | Markdown | ""              | 100   |
+    /// | README.md | Markdown | "Rust"          | 50    |
+    /// | README.md | Markdown | "Rust,Markdown" | 20    |
+    ///
+    /// becomes:
+    ///
+    /// ```
+    /// README.md (Markdown) {
+    ///   stats: 100 lines
+    ///   blobs: {
+    ///     Rust: {
+    ///       stats: 50 lines
+    ///       blobs: { Markdown: { 20 lines } }
+    ///     }
+    ///   }
+    /// }
+    /// ```
+    pub(super) fn from_str(s: &str) -> Result<Output, Box<dyn Error>> {
+        let mut csv = csv::Reader::from_reader(s.as_bytes());
+        let mut files: HashMap<PathBuf, (usize, LanguageType, Report)> = HashMap::new();
+
+        // Parse CSV records and group by file
+        for (idx, record) in csv.deserialize::<Record>().enumerate() {
+            let record = record?;
+            match files.entry(record.file.clone()) {
+                Entry::Occupied(mut entry) => {
+                    let (_, _, report) = entry.get_mut();
+
+                    // Navigate blob tree path, create missing nodes
+                    let stats = record.nested.iter().fold(&mut report.stats, |stats, lang| {
+                        stats.blobs.entry(*lang).or_default()
+                    });
+                    *stats += record.to_code_stats();
+                }
+                Entry::Vacant(entry) => {
+                    entry.insert((idx, record.language, record.to_report()));
+                }
+            }
+        }
+
+        let mut languages = LanguageMap::new();
+        let mut totals = Language::new();
+
+        // Sort by original order and aggregate
+        let mut sorted_files: Vec<_> = files.into_values().collect();
+        sorted_files.sort_unstable_by_key(|(idx, _, _)| *idx);
+
+        for (_, lang_type, report) in sorted_files {
+            totals.add_report(report.clone());
+            languages.entry(lang_type).or_default().add_report(report);
+        }
+
+        languages.values_mut().for_each(Language::total);
+        totals.total();
+
+        Ok(Output { languages, totals })
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/main.rs b/src/main.rs
index 62619dfe6..2615bb72d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -30,7 +30,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     let config = cli.override_config(Config::from_config_files());
     let mut languages = Languages::new();
 
-    if let Some(input) = cli.file_input() {
+    if let Some(input) = cli.file_input().as_ref() {
         if !add_input(input, &mut languages) {
            Cli::print_input_parse_failure(input);
            process::exit(1);