diff --git a/Cargo.lock b/Cargo.lock index 7db835d93..5d6c267f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arbitrary" version = "1.3.2" @@ -96,6 +102,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bit-set" version = "0.5.3" @@ -133,6 +145,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", + "regex-automata", "serde", ] @@ -418,6 +431,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.1.0" @@ -1060,6 +1084,12 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.34" @@ -1337,6 +1367,21 @@ dependencies = [ "syn", ] +[[package]] +name = "tiktoken-rs" +version = "0.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "25563eeba904d770acf527e8b370fe9a5547bacd20ff84a0b6c3bc41288e5625" +dependencies = [ + "anyhow", + "base64", + "bstr", + "fancy-regex", + "lazy_static", + "regex", + "rustc-hash", +] + [[package]] name = "tinyvec" version = "1.8.0" @@ -1387,6 +1432,7 @@ dependencies = [ "tempfile", "tera", "term_size", + "tiktoken-rs", "toml", ] diff --git a/Cargo.toml b/Cargo.toml index 6cd40515f..b63719200 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,7 @@ regex = "1.10.6" serde_json = "1.0.125" etcetera = "0.8.0" table_formatter = "0.6.1" +tiktoken-rs = "0.7.0" [dependencies.env_logger] features = [] diff --git a/src/cli.rs b/src/cli.rs index 5243cb032..5093f46ad 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -8,7 +8,7 @@ use crate::{ cli_utils::{crate_version, parse_or_exit, NumberFormatStyle}, consts::{ BLANKS_COLUMN_WIDTH, CODE_COLUMN_WIDTH, COMMENTS_COLUMN_WIDTH, LANGUAGE_COLUMN_WIDTH, - LINES_COLUMN_WIDTH, PATH_COLUMN_WIDTH, + LINES_COLUMN_WIDTH, PATH_COLUMN_WIDTH, TOKENS_COLUMN_WIDTH, }, input::Format, }; @@ -446,13 +446,14 @@ impl Cli { }), Some(Streaming::Simple) => Some(|l: LanguageType, e| { println!( - "{:>LANGUAGE_COLUMN_WIDTH$} {:LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + "{:>LANGUAGE_COLUMN_WIDTH$} {:LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", l.name(), e.name.to_string_lossy().to_string(), e.stats.lines(), e.stats.code, e.stats.comments, - e.stats.blanks + e.stats.blanks, + e.stats.tokens, ); }), _ => None, diff --git a/src/cli_utils.rs b/src/cli_utils.rs index 9bedce940..ca592699d 100644 --- a/src/cli_utils.rs +++ b/src/cli_utils.rs @@ -15,12 +15,12 @@ use tokei::{find_char_boundary, CodeStats, Language, LanguageType, Report}; use crate::consts::{ BLANKS_COLUMN_WIDTH, CODE_COLUMN_WIDTH, COMMENTS_COLUMN_WIDTH, FILES_COLUMN_WIDTH, - LINES_COLUMN_WIDTH, + 
LINES_COLUMN_WIDTH, TOKENS_COLUMN_WIDTH, }; -const NO_LANG_HEADER_ROW_LEN: usize = 69; -const NO_LANG_ROW_LEN: usize = 63; -const NO_LANG_ROW_LEN_NO_SPACES: usize = 56; +const NO_LANG_HEADER_ROW_LEN: usize = 69 + 13; +const NO_LANG_ROW_LEN: usize = 63 + 13; +const NO_LANG_ROW_LEN_NO_SPACES: usize = 56 + 13; const IDENT_INACCURATE: &str = "(!)"; pub fn crate_version() -> String { @@ -155,13 +155,14 @@ impl Printer { let files_column_width: usize = FILES_COLUMN_WIDTH + 6; writeln!( self.writer, - " {:<6$} {:>files_column_width$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + " {:<7$} {:>files_column_width$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", "Language".bold().blue(), "Files".bold().blue(), "Lines".bold().blue(), "Code".bold().blue(), "Comments".bold().blue(), "Blanks".bold().blue(), + "Tokens".bold().blue(), self.columns - NO_LANG_HEADER_ROW_LEN )?; self.print_row() @@ -183,7 +184,7 @@ impl Printer { write!(self.writer, " ")?; writeln!( self.writer, - "{:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + "{:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", language .reports .len() @@ -192,6 +193,7 @@ impl Printer { language.code.to_formatted_string(&self.number_format), language.comments.to_formatted_string(&self.number_format), language.blanks.to_formatted_string(&self.number_format), + language.tokens.to_formatted_string(&self.number_format), ) } @@ -203,7 +205,7 @@ impl Printer { write!(self.writer, " ")?; writeln!( self.writer, - "{:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + "{:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} 
{:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", language .children .values() @@ -227,6 +229,7 @@ impl Printer { .blanks .to_formatted_string(&self.number_format) .blue(), + language.tokens.to_formatted_string(&self.number_format).blue(), ) } @@ -284,12 +287,13 @@ impl Printer { } else { writeln!( self.writer, - " {:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + " {:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", stats.len().to_formatted_string(&self.number_format), (code + comments + blanks).to_formatted_string(&self.number_format), code.to_formatted_string(&self.number_format), comments.to_formatted_string(&self.number_format), blanks.to_formatted_string(&self.number_format), + stats.iter().map(|s| s.tokens).sum::().to_formatted_string(&self.number_format), ) } } @@ -414,12 +418,13 @@ impl Printer { writeln!( self.writer, - " {:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + " {:>FILES_COLUMN_WIDTH$} {:>LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", " ", stats.lines().to_formatted_string(&self.number_format), stats.code.to_formatted_string(&self.number_format), stats.comments.to_formatted_string(&self.number_format), stats.blanks.to_formatted_string(&self.number_format), + stats.tokens.to_formatted_string(&self.number_format), ) } @@ -467,7 +472,7 @@ impl Printer { let lines_column_width: usize = FILES_COLUMN_WIDTH + 6; writeln!( self.writer, - " {: lines_column_width$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + " {: lines_column_width$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", name, report .stats @@ -479,6 +484,7 
@@ impl Printer { .comments .to_formatted_string(&self.number_format), report.stats.blanks.to_formatted_string(&self.number_format), + report.stats.tokens.to_formatted_string(&self.number_format), max = max_len ) } diff --git a/src/consts.rs b/src/consts.rs index ce451dacd..4df5bd86e 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,7 +1,7 @@ // Set of common pub consts. /// Fallback row length -pub const FALLBACK_ROW_LEN: usize = 81; +pub const FALLBACK_ROW_LEN: usize = 81 + 13; // Column widths used for console printing. @@ -25,3 +25,6 @@ pub const COMMENTS_COLUMN_WIDTH: usize = 12; /// Blanks column width pub const BLANKS_COLUMN_WIDTH: usize = 12; + +/// Tokens column width +pub const TOKENS_COLUMN_WIDTH: usize = 12; diff --git a/src/language/language_type.rs b/src/language/language_type.rs index bffdf7a87..25d0b702c 100644 --- a/src/language/language_type.rs +++ b/src/language/language_type.rs @@ -108,24 +108,26 @@ impl LanguageType { // first character in the column, so removing starting whitespace // could cause a miscount. 
let line = if is_fortran { line } else { line.trim() }; + let tokens = crate::tokens::count_tokens_from_bytes(line); if line.trim().is_empty() { - (1, 0, 0) + (1, 0, 0, tokens) } else if is_literate || comments.iter().any(|c| line.starts_with(c.as_bytes())) { - (0, 0, 1) + (0, 0, 1, tokens) } else { - (0, 1, 0) + (0, 1, 0, tokens) } }) - .reduce(|| (0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2)) + .reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3)) }; - let (mut stats, (blanks, code, comments)) = rayon::join(parse_lines, simple_parse); + let (mut stats, (blanks, code, comments, tokens)) = rayon::join(parse_lines, simple_parse); stats.blanks += blanks; stats.code += code; stats.comments += comments; + stats.tokens += tokens; stats } else { self.parse_lines(config, text, CodeStats::new(), syntax) @@ -211,6 +213,9 @@ impl LanguageType { } } + let tokens = crate::tokens::count_tokens_from_bytes(lines); + stats.tokens += tokens; + stats } diff --git a/src/language/languages.rs b/src/language/languages.rs index 2b0fc5845..9165ebcc8 100644 --- a/src/language/languages.rs +++ b/src/language/languages.rs @@ -107,6 +107,7 @@ impl Languages { total.comments += language.comments; total.blanks += language.blanks; total.code += language.code; + total.tokens += language.tokens; total.inaccurate |= language.inaccurate; total.children.insert(*ty, language.reports.clone()); } diff --git a/src/language/mod.rs b/src/language/mod.rs index f6ffa06ca..8a5d849a4 100644 --- a/src/language/mod.rs +++ b/src/language/mod.rs @@ -18,6 +18,8 @@ pub struct Language { pub code: usize, /// The total number of comments(both single, and multi-line) pub comments: usize, + /// The total number of tokens. + pub tokens: usize, /// A collection of statistics of individual files. pub reports: Vec, /// A map of any languages found in the reports. 
@@ -77,6 +79,7 @@ impl Language { summary.comments += stats.comments; summary.code += stats.code; summary.blanks += stats.blanks; + summary.tokens += stats.tokens; } } @@ -104,16 +107,19 @@ impl Language { let mut blanks = 0; let mut code = 0; let mut comments = 0; + let mut tokens = 0; for report in &self.reports { blanks += report.stats.blanks; code += report.stats.code; comments += report.stats.comments; + tokens += report.stats.tokens; } self.blanks = blanks; self.code = code; self.comments = comments; + self.tokens = tokens; } /// Checks if the language is empty. Empty meaning it doesn't have any diff --git a/src/lib.rs b/src/lib.rs index a2b4d0e66..3cf4e1223 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,7 @@ mod consts; mod language; mod sort; mod stats; +mod tokens; pub use self::{ config::Config, diff --git a/src/main.rs b/src/main.rs index 62619dfe6..fbb549de8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ use crate::{ cli_utils::Printer, consts::{ BLANKS_COLUMN_WIDTH, CODE_COLUMN_WIDTH, COMMENTS_COLUMN_WIDTH, FALLBACK_ROW_LEN, - LANGUAGE_COLUMN_WIDTH, LINES_COLUMN_WIDTH, PATH_COLUMN_WIDTH, + LANGUAGE_COLUMN_WIDTH, LINES_COLUMN_WIDTH, PATH_COLUMN_WIDTH, TOKENS_COLUMN_WIDTH, }, input::add_input, }; @@ -61,17 +61,18 @@ fn main() -> Result<(), Box> { if cli.streaming == Some(crate::cli::Streaming::Simple) { println!( - "#{:^LANGUAGE_COLUMN_WIDTH$} {:^PATH_COLUMN_WIDTH$} {:^LINES_COLUMN_WIDTH$} {:^CODE_COLUMN_WIDTH$} {:^COMMENTS_COLUMN_WIDTH$} {:^BLANKS_COLUMN_WIDTH$}", - "language", "path", "lines", "code", "comments", "blanks" + "#{:^LANGUAGE_COLUMN_WIDTH$} {:^PATH_COLUMN_WIDTH$} {:^LINES_COLUMN_WIDTH$} {:^CODE_COLUMN_WIDTH$} {:^COMMENTS_COLUMN_WIDTH$} {:^BLANKS_COLUMN_WIDTH$} {:^TOKENS_COLUMN_WIDTH$}", + "language", "path", "lines", "code", "comments", "blanks", "tokens" ); println!( - "{:>LANGUAGE_COLUMN_WIDTH$} {:LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + 
"{:>LANGUAGE_COLUMN_WIDTH$} {:LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", (0..10).map(|_| "#").collect::(), (0..80).map(|_| "#").collect::(), (0..12).map(|_| "#").collect::(), (0..12).map(|_| "#").collect::(), (0..12).map(|_| "#").collect::(), - (0..12).map(|_| "#").collect::() + (0..12).map(|_| "#").collect::(), + (0..12).map(|_| "#").collect::(), ); } diff --git a/src/stats.rs b/src/stats.rs index 5562e65e7..2d36ebf89 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -1,5 +1,5 @@ use crate::consts::{ - BLANKS_COLUMN_WIDTH, CODE_COLUMN_WIDTH, COMMENTS_COLUMN_WIDTH, LINES_COLUMN_WIDTH, + BLANKS_COLUMN_WIDTH, CODE_COLUMN_WIDTH, COMMENTS_COLUMN_WIDTH, LINES_COLUMN_WIDTH, TOKENS_COLUMN_WIDTH, }; use crate::LanguageType; use std::{collections::BTreeMap, fmt, ops, path::PathBuf}; @@ -16,6 +16,9 @@ pub struct CodeStats { pub comments: usize, /// Language blobs that were contained inside this blob. pub blobs: BTreeMap, + + /// The token count of in the blob. + pub tokens: usize, } impl CodeStats { @@ -60,6 +63,7 @@ impl ops::AddAssign<&'_ CodeStats> for CodeStats { self.blanks += rhs.blanks; self.code += rhs.code; self.comments += rhs.comments; + self.tokens += rhs.tokens; for (language, stats) in &rhs.blobs { *self.blobs.entry(*language).or_default() += stats; @@ -111,12 +115,13 @@ macro_rules! 
display_stats { ($f:expr, $this:expr, $name:expr, $max:expr) => { write!( $f, - " {: LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$}", + " {: LINES_COLUMN_WIDTH$} {:>CODE_COLUMN_WIDTH$} {:>COMMENTS_COLUMN_WIDTH$} {:>BLANKS_COLUMN_WIDTH$} {:>TOKENS_COLUMN_WIDTH$}", $name, $this.stats.lines(), $this.stats.code, $this.stats.comments, $this.stats.blanks, + $this.stats.tokens, max = $max ) };
diff --git a/src/tokens.rs b/src/tokens.rs
new file mode 100644
index 000000000..b7c83fcb0
--- /dev/null
+++ b/src/tokens.rs
@@ -0,0 +1,25 @@
+//! Token counting backed by the `p50k_base` encoding from `tiktoken-rs`.
+
+use once_cell::sync::Lazy;
+use tiktoken_rs::CoreBPE;
+
+/// Tokenizer shared by every caller; built once, on first use.
+static TOKENIZER: Lazy<CoreBPE> =
+    Lazy::new(|| tiktoken_rs::p50k_base().expect("p50k_base tokenizer failed to initialise"));
+
+/// Returns the number of tokens that `text` encodes to.
+// NOTE(review): encoding runs with special tokens enabled, so a literal
+// special-token marker inside a file is presumably counted as one token.
+// Confirm that this, rather than `encode_ordinary`, is intended for source text.
+pub fn count_tokens(text: &str) -> usize {
+    TOKENIZER.encode_with_special_tokens(text).len()
+}
+
+/// Returns the number of tokens in `bytes`, decoding them as UTF-8 first.
+///
+/// Invalid sequences are replaced with U+FFFD instead of being rejected.
+/// `String::from_utf8_lossy` borrows its input when the bytes are already valid
+/// UTF-8, so valid text needs no separate branch.
+pub fn count_tokens_from_bytes(bytes: &[u8]) -> usize {
+    count_tokens(&String::from_utf8_lossy(bytes))
+}