diff --git a/Cargo.lock b/Cargo.lock index b9f070b0fa6..631eab0d085 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,6 +345,12 @@ dependencies = [ "derive_arbitrary", ] +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -1103,6 +1109,28 @@ dependencies = [ "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + [[package]] name = "chumsky" version = "0.9.3" @@ -1225,6 +1253,16 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "colored" version = "3.0.0" @@ -1352,7 +1390,7 @@ dependencies = [ "chrono", "claims", "clap", - "colored", + "colored 3.0.0", "cookie", "crates_io_cdn_logs", "crates_io_database", @@ -1583,6 +1621,16 @@ dependencies = [ "url", ] +[[package]] +name = "crates_io_linecount" +version = "0.0.0" +dependencies = [ + "claims", + "insta", + "serde", + "tokei", +] + [[package]] name = "crates_io_markdown" version = "0.0.0" @@ -1648,6 +1696,7 @@ dependencies = [ "cargo-manifest", 
"claims", "clap", + "crates_io_linecount", "flate2", "futures-util", "indicatif", @@ -1926,6 +1975,21 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", + "serde", +] + [[package]] name = "deadpool" version = "0.12.1" @@ -2306,12 +2370,44 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "entities" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -2328,6 +2424,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + [[package]] name = "event-listener" version = "5.4.0" @@ -2652,6 +2759,17 @@ dependencies = [ 
"regex-syntax 0.8.5", ] +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags", + "ignore", + "walkdir", +] + [[package]] name = "googletest" version = "0.14.2" @@ -2675,6 +2793,30 @@ dependencies = [ "syn", ] +[[package]] +name = "grep-matcher" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47a3141a10a43acfedc7c98a60a834d7ba00dfe7bec9071cbfc19b55b292ac02" +dependencies = [ + "memchr", +] + +[[package]] +name = "grep-searcher" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9b6c14b3fc2e0a107d6604d3231dec0509e691e62447104bc385a46a7892cda" +dependencies = [ + "bstr", + "encoding_rs", + "encoding_rs_io", + "grep-matcher", + "log", + "memchr", + "memmap2", +] + [[package]] name = "group" version = "0.13.0" @@ -2922,6 +3064,15 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "humansize" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cb51c9a029ddc91b07a787f1d86b53ccfa49b0e86688c946ebe8d3555685dd7" +dependencies = [ + "libm", +] + [[package]] name = "humantime" version = "2.1.0" @@ -3218,6 +3369,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.9", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "impl-more" version = "0.1.9" @@ -3344,6 +3511,30 @@ version = "1.0.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3378,6 +3569,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "json5" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b0db21af676c1ce64250b5f40f3ce2cf27e4e47cb91ed91eb6fe9350b430c1" +dependencies = [ + "pest", + "pest_derive", + "serde", +] + [[package]] name = "jsonwebtoken" version = "9.3.1" @@ -3657,6 +3859,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "memo-map" version = "0.3.3" @@ -3772,7 +3983,7 @@ checksum = "7760e0e418d9b7e5777c0374009ca4c93861b9066f18cb334a20ce50ab63aa48" dependencies = [ "assert-json-diff", "bytes", - "colored", + "colored 3.0.0", "futures-util", "http 1.3.1", "http-body 1.0.1", @@ -3884,6 +4095,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = 
"num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -4109,6 +4330,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parse-zoneinfo" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" +dependencies = [ + "regex", +] + [[package]] name = "paste" version = "1.0.15" @@ -4331,6 +4561,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres-native-tls" version = "0.5.1" @@ -5565,6 +5804,17 @@ dependencies = [ "libc", ] +[[package]] +name = "table_formatter" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "beef5d3fd5472c911d41286849de6a9aee93327f7fae9fb9148fe9ff0102c17d" +dependencies = [ + "colored 2.2.0", + "itertools 0.11.0", + "thiserror 1.0.69", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5606,6 +5856,38 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tera" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab9d851b45e865f178319da0abdbfe6acbc4328759ff18dafc3a41c16b4cd2ee" +dependencies = [ + "chrono", + "chrono-tz", + "globwalk", + "humansize", + "lazy_static", + "percent-encoding", + "pest", + "pest_derive", + "rand 0.8.5", + "regex", + "serde", + "serde_json", + "slug", + "unic-segment", +] + +[[package]] +name = 
"term_size" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "terminal_size" version = "0.4.1" @@ -5791,6 +6073,38 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokei" +version = "13.0.0-alpha.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb367822e854c96f275dd52aad070e445cf15f1521e25d2b1dedc1dd0b1f5be" +dependencies = [ + "aho-corasick", + "arbitrary", + "clap", + "colored 2.2.0", + "crossbeam-channel", + "dashmap", + "encoding_rs_io", + "env_logger", + "etcetera", + "grep-searcher", + "ignore", + "json5", + "log", + "num-format", + "once_cell", + "parking_lot", + "rayon", + "regex", + "serde", + "serde_json", + "table_formatter", + "tera", + "term_size", + "toml", +] + [[package]] name = "tokio" version = "1.45.1" @@ -6129,6 +6443,56 @@ dependencies = [ "libc", ] +[[package]] +name = "unic-char-property" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221" +dependencies = [ + "unic-char-range", +] + +[[package]] +name = "unic-char-range" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc" + +[[package]] +name = "unic-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" + +[[package]] +name = "unic-segment" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4ed5d26be57f84f176157270c112ef57b86debac9cd21daaabbe56db0f88f23" 
+dependencies = [ + "unic-ucd-segment", +] + +[[package]] +name = "unic-ucd-segment" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2079c122a62205b421f499da10f3ee0f7697f012f55b675e002483c73ea34700" +dependencies = [ + "unic-char-property", + "unic-char-range", + "unic-ucd-version", +] + +[[package]] +name = "unic-ucd-version" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4" +dependencies = [ + "unic-common", +] + [[package]] name = "unicase" version = "2.8.1" @@ -6696,6 +7060,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -6714,6 +7087,21 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -6746,6 +7134,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -6758,6 +7152,12 @@ version = "0.53.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -6770,6 +7170,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -6794,6 +7200,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -6806,6 +7218,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -6818,6 +7236,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +[[package]] +name = "windows_x86_64_gnullvm" +version 
= "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -6830,6 +7254,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/crates/crates_io_database/src/models/version.rs b/crates/crates_io_database/src/models/version.rs index 8ca1a2edfa2..4f91e3e17c9 100644 --- a/crates/crates_io_database/src/models/version.rs +++ b/crates/crates_io_database/src/models/version.rs @@ -36,6 +36,7 @@ pub struct Version { pub homepage: Option, pub documentation: Option, pub repository: Option, + pub linecounts: Option, } impl Version { @@ -103,6 +104,7 @@ pub struct NewVersion<'a> { repository: Option<&'a str>, categories: Option<&'a [&'a str]>, keywords: Option<&'a [&'a str]>, + linecounts: Option, } impl NewVersion<'_> { diff --git a/crates/crates_io_database/src/schema.rs b/crates/crates_io_database/src/schema.rs index 9d43048b2e1..b07e87e5111 100644 --- a/crates/crates_io_database/src/schema.rs +++ b/crates/crates_io_database/src/schema.rs @@ -1077,6 +1077,8 @@ diesel::table! { keywords -> Array>, /// JSONB representation of the version number for sorting purposes. semver_ord -> Nullable, + /// Source Lines of Code statistics for this version, stored as JSON with language breakdown and totals. 
+ linecounts -> Nullable, } } diff --git a/crates/crates_io_database_dump/src/dump-db.toml b/crates/crates_io_database_dump/src/dump-db.toml index c3c28ca558e..4e792dd20f2 100644 --- a/crates/crates_io_database_dump/src/dump-db.toml +++ b/crates/crates_io_database_dump/src/dump-db.toml @@ -280,6 +280,8 @@ documentation = "public" repository = "public" categories = "public" keywords = "public" +# The following column is private for now, until we can guarantee a stable data schema. +linecounts = "private" [versions_published_by.columns] version_id = "private" diff --git a/crates/crates_io_linecount/Cargo.toml b/crates/crates_io_linecount/Cargo.toml new file mode 100644 index 00000000000..a4d69204238 --- /dev/null +++ b/crates/crates_io_linecount/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "crates_io_linecount" +version = "0.0.0" +description = "Lines of code counting for crates.io using tokei" +license = "MIT OR Apache-2.0" +edition = "2024" + +[lints] +workspace = true + +[dependencies] +serde = { version = "=1.0.219", features = ["derive"] } +tokei = "=13.0.0-alpha.8" + +[dev-dependencies] +claims = "=0.8.0" +insta = { version = "=1.43.1", features = ["json"] } diff --git a/crates/crates_io_linecount/src/lib.rs b/crates/crates_io_linecount/src/lib.rs new file mode 100644 index 00000000000..b569e1570b6 --- /dev/null +++ b/crates/crates_io_linecount/src/lib.rs @@ -0,0 +1,258 @@
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::Path;
+use std::sync::LazyLock;
+use tokei::Config;
+
+// Re-export LanguageType for use by other crates
+pub use tokei::LanguageType;
+
+/// Tokei configuration used for analysis (cached)
+static TOKEI_CONFIG: LazyLock<Config> = LazyLock::new(|| Config {
+    no_ignore: Some(true),
+    treat_doc_strings_as_comments: Some(true),
+    ..Default::default()
+});
+
+/// Statistics for a single programming language
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct LanguageStats {
+    /// Number of lines of code (excluding comments and blank lines)
+    pub code_lines: usize,
+    /// Number of comment lines
+    pub comment_lines: usize,
+    /// Number of files of this language
+    pub files: usize,
+}
+
+/// Complete line count statistics for a crate
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct LinecountStats {
+    /// Per-language breakdown of line counts
+    pub languages: HashMap<LanguageType, LanguageStats>,
+    /// Total lines of code across all languages
+    pub total_code_lines: usize,
+    /// Total comment lines across all languages
+    pub total_comment_lines: usize,
+}
+
+impl LinecountStats {
+    /// Create a new empty statistics collection
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add a single file to the statistics
+    ///
+    /// The caller can use `should_count_path()` to check if a file should be processed
+    /// before decompressing to avoid unnecessary work.
+    pub fn add_file(&mut self, language_type: LanguageType, content: &[u8]) {
+        let file_stats = language_type.parse_from_slice(content, &TOKEI_CONFIG);
+
+        // Update language-specific stats
+        let entry = self.languages.entry(language_type).or_default();
+        entry.code_lines += file_stats.code;
+        entry.comment_lines += file_stats.comments;
+        entry.files += 1;
+
+        // Update totals
+        self.total_code_lines += file_stats.code;
+        self.total_comment_lines += file_stats.comments;
+    }
+}
+
+/// Directory names whose contents are excluded from line counts
+///
+/// Names are compared case-insensitively against whole path components.
+const SKIPPED_DIRECTORIES: &[&str] = &[
+    "tests",
+    "test",
+    "testing",
+    "examples",
+    "benches",
+    "benchmark",
+];
+
+/// Check if a path should be counted and return its language type
+///
+/// A file is skipped when one of its parent directories is a test, example or
+/// benchmark directory, when it is hidden (its name starts with `.`), when its
+/// extension does not map to a known language, or when that language is not
+/// considered countable.
+///
+/// Returns `Some(LanguageType)` if the file should be analyzed, `None` otherwise.
+pub fn should_count_path(path: &Path) -> Option<LanguageType> {
+    // Skip test, example and benchmark directories. Whole directory names are
+    // compared instead of substrings of the path, so that directories such as
+    // `latest/` or `contest/` are not mistaken for a `test/` directory.
+    let in_skipped_directory = path.parent().is_some_and(|parent| {
+        parent.components().any(|component| {
+            let name = component.as_os_str().to_string_lossy();
+            SKIPPED_DIRECTORIES
+                .iter()
+                .any(|skipped| name.eq_ignore_ascii_case(skipped))
+        })
+    });
+    if in_skipped_directory {
+        return None;
+    }
+
+    // Skip hidden files
+    let is_hidden = path
+        .file_name()
+        .is_some_and(|filename| filename.to_string_lossy().starts_with('.'));
+    if is_hidden {
+        return None;
+    }
+
+    // Get language type from file extension
+    let extension = path.extension().and_then(|ext| ext.to_str())?;
+    let language_type = LanguageType::from_file_extension(extension)?;
+
+    // Only count if it's a programming language
+    is_countable_language(language_type).then_some(language_type)
+}
+
+/// Determine if a language should be counted
+fn is_countable_language(lang: LanguageType) -> bool {
+    !matches!(
+        lang,
+        // Configuration and data files
+        LanguageType::Json |
+        LanguageType::Yaml |
+        LanguageType::Toml |
+        LanguageType::Xml |
+        LanguageType::Ini |
+
+        // Documentation
+        LanguageType::Markdown |
+        LanguageType::Text |
+        LanguageType::ReStructuredText |
+        LanguageType::AsciiDoc |
+        LanguageType::Org |
+
+        // Build system files
+        LanguageType::Makefile |
+        LanguageType::CMake |
+        LanguageType::Dockerfile |
+        LanguageType::Autoconf |
+        LanguageType::MsBuild |
+        LanguageType::Meson |
+        LanguageType::Scons |
+        LanguageType::Bazel |
+        LanguageType::Nix |
+
+        // Shell scripts (debatable, but often just build/deploy automation)
+        LanguageType::Batch |
+        LanguageType::PowerShell |
+
+        // Other non-programming files
+        LanguageType::Svg |
+        LanguageType::Hex |
+        LanguageType::Protobuf |
+        LanguageType::Thrift
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use claims::{assert_none, assert_some};
+
+    #[test]
+    fn test_empty() {
+        let stats = LinecountStats::new();
+        insta::assert_json_snapshot!(stats, @r#"
+        {
+          "languages": {},
+          "total_code_lines": 0,
+          "total_comment_lines": 0
+        }
+        "#);
+    }
+
+    #[test]
+    fn test_add_file() {
+        let mut stats = LinecountStats::new();
+
+        // Add a Rust file
+        let rust_code = b"// This is a comment\nfn main() {\n println!(\"Hello\");\n}";
+        stats.add_file(LanguageType::Rust, rust_code);
+
+        insta::assert_json_snapshot!(stats, @r#"
+        {
+          "languages": {
+            "Rust": {
+              "code_lines": 3,
+              "comment_lines": 1,
+              "files": 1
+            }
+          },
+          "total_code_lines": 3,
+          "total_comment_lines": 1
+        }
+        "#);
+    }
+
+    #[test]
+    fn test_workflow() {
+        let mut stats = LinecountStats::new();
+
+        let files = [
+            ("src/lib.rs", "pub fn hello() {}"),
+            ("tests/test.rs", "fn test() {}"), // Should be skipped
+            ("README.md", "# Hello"),          // Should be skipped
+        ];
+
+        for (path, content) in files {
+            let path = Path::new(path);
+            if let Some(language_type) = should_count_path(path) {
+                stats.add_file(language_type, content.as_bytes());
+            }
+        }
+
+        insta::assert_json_snapshot!(stats, @r#"
+        {
+          "languages": {
+            "Rust": {
+              "code_lines": 1,
+              "comment_lines": 0,
+              "files": 1
+            }
+          },
+          "total_code_lines": 1,
+          "total_comment_lines": 0
+        }
+        "#);
+    }
+
+    #[test]
+    fn test_should_count_path() {
+        assert_none!(should_count_path(Path::new("src/tests/mod.rs")));
+        assert_none!(should_count_path(Path::new("tests/integration.rs")));
+        assert_none!(should_count_path(Path::new("examples/basic.rs")));
+        assert_none!(should_count_path(Path::new("benches/bench.rs")));
+        assert_some!(should_count_path(Path::new("src/lib.rs")));
+        // Directory names that merely contain "test" or "examples" are not skipped
+        assert_some!(should_count_path(Path::new("src/latest/mod.rs")));
+        assert_some!(should_count_path(Path::new("src/counterexamples/mod.rs")));
+        // Matching is case-insensitive and hidden files are skipped
+        assert_none!(should_count_path(Path::new("Tests/integration.rs")));
+        assert_none!(should_count_path(Path::new("src/.hidden.rs")));
+    }
+
+    #[test]
+    fn test_language_filtering() {
+        // Should count programming languages
+        assert!(is_countable_language(LanguageType::Rust));
+        assert!(is_countable_language(LanguageType::JavaScript));
+        assert!(is_countable_language(LanguageType::Html));
+        assert!(is_countable_language(LanguageType::Css));
+
+        // Should skip config/data files
+        assert!(!is_countable_language(LanguageType::Json));
+        assert!(!is_countable_language(LanguageType::Yaml));
+        assert!(!is_countable_language(LanguageType::Toml));
+        assert!(!is_countable_language(LanguageType::Markdown));
+    }
+}
diff --git a/crates/crates_io_tarball/Cargo.toml b/crates/crates_io_tarball/Cargo.toml index b58301fd23c..875e27c5c53 100644 --- a/crates/crates_io_tarball/Cargo.toml +++ b/crates/crates_io_tarball/Cargo.toml @@ -13,6 +13,7 @@ builder = ["dep:flate2", "dep:tar"] [dependencies] astral-tokio-tar = "=0.5.2" cargo-manifest = "=0.19.1" +crates_io_linecount = { path = "../crates_io_linecount" } flate2 = { version = "=1.1.2", optional = true } serde = { version = "=1.0.219", features = ["derive"] } serde_json = "=1.0.140" diff --git a/crates/crates_io_tarball/src/lib.rs b/crates/crates_io_tarball/src/lib.rs index 43069670e5f..4773021ef08 100644 --- a/crates/crates_io_tarball/src/lib.rs +++ b/crates/crates_io_tarball/src/lib.rs @@ -30,6 +30,7 @@ const DEFAULT_BUF_SIZE: usize = 128 * 1024; pub struct TarballInfo { pub manifest: Manifest, pub vcs_info: Option, + pub linecount_stats: crates_io_linecount::LinecountStats, } #[derive(Debug, thiserror::Error)] @@ -74,6 +75,7 @@ pub async fn process_tarball( let mut vcs_info = None; let mut paths = Vec::new(); let mut manifests = BTreeMap::new(); + let mut linecount_stats = crates_io_linecount::LinecountStats::new(); let mut entries = archive.entries()?; while let Some(entry) = entries.next().await { @@ -103,6 +105,12 @@ pub async fn process_tarball( paths.push(in_pkg_path.to_path_buf()); + // Check if this file should be counted for line statistics + let is_file = entry_type.is_file(); + let language_type_for_counting = is_file + .then(|| crates_io_linecount::should_count_path(in_pkg_path)) + .flatten(); + // Let's go hunting for the VCS info and crate manifest. The only valid place for these is // in the package root in the tarball. 
let in_pkg_path_str = in_pkg_path.to_string_lossy(); @@ -121,6 +129,11 @@ pub async fn process_tarball( validate_manifest(&manifest)?; manifests.insert(owned_entry_path, manifest); + } else if let Some(language_type) = language_type_for_counting { + // If this is a file that we want to count, read it and update the line count stats. + let mut contents = Vec::new(); + entry.read_to_end(&mut contents).await?; + linecount_stats.add_file(language_type, &contents); } } @@ -146,7 +159,11 @@ pub async fn process_tarball( manifest.complete_from_abstract_filesystem(&PathsFileSystem(paths))?; - Ok(TarballInfo { manifest, vcs_info }) + Ok(TarballInfo { + manifest, + vcs_info, + linecount_stats, + }) } struct PathsFileSystem(Vec); diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__app.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__app.snap index 50d2a1b6cc8..832c18c70ca 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__app.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__app.snap @@ -76,4 +76,15 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: { + Rust: LanguageStats { + code_lines: 1, + comment_lines: 0, + files: 1, + }, + }, + total_code_lines: 1, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib.snap index 7272d2dfa02..e19708bcac5 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib.snap @@ -80,4 +80,15 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: { + Rust: LanguageStats { + code_lines: 1, + comment_lines: 0, + files: 1, + }, + }, + total_code_lines: 1, + total_comment_lines: 0, + }, } diff --git 
a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib_with_bins_and_example.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib_with_bins_and_example.snap index db43f0beddf..8cc87c7d28a 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib_with_bins_and_example.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__lib_with_bins_and_example.snap @@ -140,4 +140,15 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: { + Rust: LanguageStats { + code_lines: 3, + comment_lines: 0, + files: 3, + }, + }, + total_code_lines: 3, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test.snap index 7d368fe0afc..b86a5b4bf73 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test.snap @@ -57,4 +57,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_incomplete_vcs_info.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_incomplete_vcs_info.snap index 309d511eb9d..05ace48e6bf 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_incomplete_vcs_info.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_incomplete_vcs_info.snap @@ -61,4 +61,9 @@ TarballInfo { path_in_vcs: "", }, ), + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git 
a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_lowercase_manifest.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_lowercase_manifest.snap index ecf1471317e..be81255d1cb 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_lowercase_manifest.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_lowercase_manifest.snap @@ -61,4 +61,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest.snap index a163d2768df..869571d5c9c 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest.snap @@ -71,4 +71,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_boolean_readme.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_boolean_readme.snap index b86b2eed48f..2ae6909db7c 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_boolean_readme.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_boolean_readme.snap @@ -63,4 +63,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + 
total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_default_readme.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_default_readme.snap index 7d368fe0afc..b86a5b4bf73 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_default_readme.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_default_readme.snap @@ -57,4 +57,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_project.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_project.snap index caec023b7eb..116f2f81732 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_project.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_manifest_with_project.snap @@ -61,4 +61,9 @@ TarballInfo { badges: None, }, vcs_info: None, + linecount_stats: LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_vcs_info.snap b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_vcs_info.snap index 63ab7fb2053..62d2e52fe03 100644 --- a/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_vcs_info.snap +++ b/crates/crates_io_tarball/src/snapshots/crates_io_tarball__tests__process_tarball_test_vcs_info.snap @@ -61,4 +61,9 @@ TarballInfo { path_in_vcs: "path/in/vcs", }, ), + linecount_stats: 
LinecountStats { + languages: {}, + total_code_lines: 0, + total_comment_lines: 0, + }, } diff --git a/migrations/2025-06-26-183025_add-linecounts-column/down.sql b/migrations/2025-06-26-183025_add-linecounts-column/down.sql new file mode 100644 index 00000000000..af3ef3a98de --- /dev/null +++ b/migrations/2025-06-26-183025_add-linecounts-column/down.sql @@ -0,0 +1,3 @@ +-- Remove line count statistics column from versions table +ALTER TABLE versions +DROP COLUMN linecounts; \ No newline at end of file diff --git a/migrations/2025-06-26-183025_add-linecounts-column/up.sql b/migrations/2025-06-26-183025_add-linecounts-column/up.sql new file mode 100644 index 00000000000..59bf26b2d0f --- /dev/null +++ b/migrations/2025-06-26-183025_add-linecounts-column/up.sql @@ -0,0 +1,6 @@ +-- Add line count statistics column to versions table +ALTER TABLE versions +ADD COLUMN linecounts JSONB DEFAULT NULL; + +-- Add comment explaining the column +COMMENT ON COLUMN versions.linecounts IS 'Source Lines of Code statistics for this version, stored as JSON with language breakdown and totals.'; diff --git a/src/controllers/krate/publish.rs b/src/controllers/krate/publish.rs index e29e4e4c99d..4b9cd71e913 100644 --- a/src/controllers/krate/publish.rs +++ b/src/controllers/krate/publish.rs @@ -26,7 +26,7 @@ use sha2::{Digest, Sha256}; use std::collections::HashMap; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_util::io::StreamReader; -use tracing::{error, instrument}; +use tracing::{error, instrument, warn}; use url::Url; use crate::models::{ @@ -482,6 +482,10 @@ pub async fn publish(app: AppState, req: Parts, body: Body) -> AppResult AppResult