feat(repository): add language statistics analysis feature
- Remove data directory from gitignore to include language data - Add build script to parse linguist languages.yml and generate static mappings - Include serde and serde_yml dependencies for YAML parsing - Add lang_stats module with language detection and statistics calculation - Generate protobuf definitions for language statistics API endpoints - Implement GetLanguageStats RPC endpoint in repository server - Add comprehensive test suite for language statistics functionality - Include extension and filename based language detection logic - Implement binary file classification and group resolution features
This commit is contained in:
@@ -5,4 +5,3 @@
|
|||||||
.project
|
.project
|
||||||
.settings
|
.settings
|
||||||
.DS_Store
|
.DS_Store
|
||||||
data
|
|
||||||
Generated
+32
@@ -699,6 +699,7 @@ dependencies = [
|
|||||||
"ractor_cluster",
|
"ractor_cluster",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"serde_yml",
|
||||||
"sha2",
|
"sha2",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
@@ -1892,6 +1893,16 @@ version = "0.2.186"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libyml"
|
||||||
|
version = "0.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3302702afa434ffa30847a83305f0a69d6abd74293b6554c18ec85c7ef30c980"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "linux-raw-sys"
|
name = "linux-raw-sys"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
@@ -2518,6 +2529,12 @@ version = "1.0.22"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ryu"
|
||||||
|
version = "1.0.23"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "same-file"
|
name = "same-file"
|
||||||
version = "1.0.6"
|
version = "1.0.6"
|
||||||
@@ -2582,6 +2599,21 @@ dependencies = [
|
|||||||
"zmij",
|
"zmij",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_yml"
|
||||||
|
version = "0.0.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd"
|
||||||
|
dependencies = [
|
||||||
|
"indexmap",
|
||||||
|
"itoa",
|
||||||
|
"libyml",
|
||||||
|
"memchr",
|
||||||
|
"ryu",
|
||||||
|
"serde",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha1"
|
name = "sha1"
|
||||||
version = "0.10.6"
|
version = "0.10.6"
|
||||||
|
|||||||
@@ -46,3 +46,5 @@ path = "main.rs"
|
|||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
tonic-prost-build = "0.14"
|
tonic-prost-build = "0.14"
|
||||||
|
serde_yml = "0.0.12"
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
|||||||
@@ -1,14 +1,18 @@
|
|||||||
|
use serde::Deserialize;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
|
let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
|
||||||
let proto_dir = manifest_dir.join("proto");
|
let proto_dir = manifest_dir.join("proto");
|
||||||
|
let data_dir = manifest_dir.join("data");
|
||||||
let out_dir = PathBuf::from(std::env::var("OUT_DIR")?);
|
let out_dir = PathBuf::from(std::env::var("OUT_DIR")?);
|
||||||
|
|
||||||
fs::create_dir_all(&out_dir)?;
|
fs::create_dir_all(&out_dir)?;
|
||||||
clean_generated_files(&out_dir)?;
|
clean_generated_files(&out_dir)?;
|
||||||
|
|
||||||
|
// Proto compilation
|
||||||
let protos = proto_files(&proto_dir)?;
|
let protos = proto_files(&proto_dir)?;
|
||||||
for proto in &protos {
|
for proto in &protos {
|
||||||
println!("cargo:rerun-if-changed={}", proto.display());
|
println!("cargo:rerun-if-changed={}", proto.display());
|
||||||
@@ -23,9 +27,242 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||||||
.out_dir(&out_dir)
|
.out_dir(&out_dir)
|
||||||
.compile_protos(&protos, &[proto_dir])?;
|
.compile_protos(&protos, &[proto_dir])?;
|
||||||
|
|
||||||
|
// Linguist language stats generation
|
||||||
|
let languages_yml = data_dir.join("languages.yml");
|
||||||
|
println!("cargo:rerun-if-changed={}", languages_yml.display());
|
||||||
|
generate_linguist(&languages_yml, &out_dir)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// One language record from `languages.yml`; the language name is the key of the map
/// this struct is deserialized into, not a field.
#[derive(Deserialize)]
|
||||||
|
struct LanguageEntry {
|
||||||
|
/// Value of the YAML `type` key; copied verbatim into the generated tables.
#[serde(rename = "type")]
|
||||||
|
lang_type: String,
|
||||||
|
/// Extensions listed for the language (empty when the key is absent).
/// `generate_linguist` treats the first element as the primary extension.
#[serde(default)]
|
||||||
|
extensions: Vec<String>,
|
||||||
|
/// Exact filenames listed for the language (empty when the key is absent).
/// `generate_linguist` treats the first element as the primary filename.
#[serde(default)]
|
||||||
|
filenames: Vec<String>,
|
||||||
|
/// Optional parent group name; emitted into the generated `LANG_GROUP_MAP`.
#[serde(default)]
|
||||||
|
group: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn generate_linguist(
|
||||||
|
languages_yml: &Path,
|
||||||
|
out_dir: &Path,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let yaml_str = fs::read_to_string(languages_yml)?;
|
||||||
|
let languages: HashMap<String, LanguageEntry> = serde_yml::from_str(&yaml_str)?;
|
||||||
|
|
||||||
|
// Build extension → (language, type) mapping
|
||||||
|
// Track primary extensions (first listed for each language) for conflict resolution
|
||||||
|
let mut ext_map: Vec<(String, String, String)> = Vec::new();
|
||||||
|
let mut ext_primary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if primary
|
||||||
|
let mut ext_secondary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if secondary
|
||||||
|
// Build filename → (language, type) mapping
|
||||||
|
let mut fname_map: Vec<(String, String, String)> = Vec::new();
|
||||||
|
let mut fname_primary: HashMap<String, (String, String)> = HashMap::new();
|
||||||
|
let mut fname_secondary: HashMap<String, (String, String)> = HashMap::new();
|
||||||
|
// Build language → type mapping
|
||||||
|
let mut lang_type_map: Vec<(String, String)> = Vec::new();
|
||||||
|
// Build language → group mapping (for resolving group names)
|
||||||
|
let mut lang_group_map: HashMap<String, String> = HashMap::new();
|
||||||
|
|
||||||
|
// Process languages in alphabetical order (deterministic)
|
||||||
|
let mut sorted_langs: Vec<_> = languages.iter().collect();
|
||||||
|
sorted_langs.sort_by(|a, b| a.0.cmp(b.0));
|
||||||
|
|
||||||
|
for (name, entry) in &sorted_langs {
|
||||||
|
let resolved_type = entry.lang_type.clone();
|
||||||
|
lang_type_map.push((name.to_string(), resolved_type.clone()));
|
||||||
|
|
||||||
|
if let Some(ref group) = entry.group {
|
||||||
|
lang_group_map.insert(name.to_string(), group.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i, ext) in entry.extensions.iter().enumerate() {
|
||||||
|
let ext_lower = ext.to_lowercase();
|
||||||
|
if i == 0 {
|
||||||
|
// Primary extension - always prefer this
|
||||||
|
ext_primary
|
||||||
|
.entry(ext_lower)
|
||||||
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||||
|
} else {
|
||||||
|
// Secondary extension - only use if no primary claims it
|
||||||
|
ext_secondary
|
||||||
|
.entry(ext_lower)
|
||||||
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i, fname) in entry.filenames.iter().enumerate() {
|
||||||
|
if i == 0 {
|
||||||
|
fname_primary
|
||||||
|
.entry(fname.clone())
|
||||||
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||||
|
} else {
|
||||||
|
fname_secondary
|
||||||
|
.entry(fname.clone())
|
||||||
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge: primary wins over secondary, with explicit priority for known conflicts
|
||||||
|
// These are common extensions where linguist has multiple primary claims
|
||||||
|
let priority_overrides: HashMap<&str, &str> = [
|
||||||
|
(".rs", "Rust"), // RenderScript also claims .rs
|
||||||
|
(".md", "Markdown"), // GCC Machine Description also claims .md
|
||||||
|
(".r", "R"), // Rebol also claims .r
|
||||||
|
(".s", "Assembly"), // Multiple assemblers claim .s
|
||||||
|
(".ms", "MAXScript"), // Unix Assembly also claims .ms
|
||||||
|
(".g", "G-code"), // GAP also claims .g
|
||||||
|
(".m", "Objective-C"), // Mercury, MUF, etc. also claim .m
|
||||||
|
(".w", "CWeb"), // OpenSCAD also claims .w
|
||||||
|
(".q", "Q"), // KBD also claims .q
|
||||||
|
].iter().cloned().collect();
|
||||||
|
|
||||||
|
for (ext, (lang, ltype)) in ext_primary {
|
||||||
|
if let Some(&preferred) = priority_overrides.get(ext.as_str()) {
|
||||||
|
// Only use this entry if it matches the preferred language
|
||||||
|
if lang == preferred {
|
||||||
|
ext_map.push((ext, lang, ltype));
|
||||||
|
}
|
||||||
|
// Otherwise skip - the preferred language's entry will be added when we process it
|
||||||
|
} else {
|
||||||
|
ext_map.push((ext, lang, ltype));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Add preferred languages for any overrides that weren't added yet
|
||||||
|
for (&ext, &preferred) in &priority_overrides {
|
||||||
|
if !ext_map.iter().any(|(e, _, _)| e == ext) {
|
||||||
|
// Find the preferred language's entry
|
||||||
|
if let Some(entry) = languages.get(preferred)
|
||||||
|
&& entry.extensions.iter().any(|e| e.to_lowercase() == ext)
|
||||||
|
{
|
||||||
|
ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (ext, (lang, ltype)) in ext_secondary {
|
||||||
|
if !ext_map.iter().any(|(e, _, _)| e == &ext) {
|
||||||
|
ext_map.push((ext, lang, ltype));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (fname, (lang, ltype)) in fname_primary {
|
||||||
|
fname_map.push((fname, lang, ltype));
|
||||||
|
}
|
||||||
|
for (fname, (lang, ltype)) in fname_secondary {
|
||||||
|
if !fname_map.iter().any(|(f, _, _)| f == &fname) {
|
||||||
|
fname_map.push((fname, lang, ltype));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort for deterministic output
|
||||||
|
ext_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||||
|
fname_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||||
|
lang_type_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||||
|
|
||||||
|
let mut code = String::with_capacity(512 * 1024);
|
||||||
|
|
||||||
|
// Extension → (language_name, lang_type) mapping
|
||||||
|
code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n");
|
||||||
|
code.push_str("/// Extension to (language_name, type) mapping.\n");
|
||||||
|
code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n");
|
||||||
|
code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n");
|
||||||
|
for (ext, lang, ltype) in &ext_map {
|
||||||
|
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
||||||
|
escape_str(ext), escape_str(lang), escape_str(ltype)));
|
||||||
|
}
|
||||||
|
code.push_str("];\n\n");
|
||||||
|
|
||||||
|
// Filename → (language_name, lang_type) mapping
|
||||||
|
code.push_str("/// Filename to (language_name, type) mapping.\n");
|
||||||
|
code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n");
|
||||||
|
code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n");
|
||||||
|
for (fname, lang, ltype) in &fname_map {
|
||||||
|
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
||||||
|
escape_str(fname), escape_str(lang), escape_str(ltype)));
|
||||||
|
}
|
||||||
|
code.push_str("];\n\n");
|
||||||
|
|
||||||
|
// Language name → type mapping
|
||||||
|
code.push_str("/// Language name to type mapping.\n");
|
||||||
|
code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n");
|
||||||
|
for (lang, ltype) in &lang_type_map {
|
||||||
|
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
||||||
|
escape_str(lang), escape_str(ltype)));
|
||||||
|
}
|
||||||
|
code.push_str("];\n\n");
|
||||||
|
|
||||||
|
// Language name → group mapping
|
||||||
|
code.push_str("/// Language name to parent group mapping.\n");
|
||||||
|
code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n");
|
||||||
|
let mut group_vec: Vec<_> = lang_group_map.iter().collect();
|
||||||
|
group_vec.sort_by(|a, b| a.0.cmp(b.0));
|
||||||
|
for (lang, group) in group_vec {
|
||||||
|
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
||||||
|
escape_str(lang), escape_str(group)));
|
||||||
|
}
|
||||||
|
code.push_str("];\n\n");
|
||||||
|
|
||||||
|
// Binary extension classification
|
||||||
|
code.push_str("/// Binary media type classification for extensions.\n");
|
||||||
|
code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n");
|
||||||
|
code.push_str(" match ext {\n");
|
||||||
|
|
||||||
|
// Image extensions
|
||||||
|
let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
|
||||||
|
".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif",
|
||||||
|
".apng", ".jfif", ".pjpeg", ".pjp"];
|
||||||
|
for ext in &image_exts {
|
||||||
|
code.push_str(&format!(" \"{}\" => \"Image\",\n", ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Video extensions
|
||||||
|
let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
|
||||||
|
".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"];
|
||||||
|
for ext in &video_exts {
|
||||||
|
code.push_str(&format!(" \"{}\" => \"Video\",\n", ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Audio extensions
|
||||||
|
let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
|
||||||
|
".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"];
|
||||||
|
for ext in &audio_exts {
|
||||||
|
code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Font extensions
|
||||||
|
let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"];
|
||||||
|
for ext in &font_exts {
|
||||||
|
code.push_str(&format!(" \"{}\" => \"Font\",\n", ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Other binary
|
||||||
|
let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o",
|
||||||
|
".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo",
|
||||||
|
".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz",
|
||||||
|
".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls",
|
||||||
|
".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"];
|
||||||
|
for ext in &binary_exts {
|
||||||
|
code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
code.push_str(" _ => \"Binary\",\n");
|
||||||
|
code.push_str(" }\n");
|
||||||
|
code.push_str("}\n");
|
||||||
|
|
||||||
|
fs::write(out_dir.join("linguist_generated.rs"), code)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Escapes `s` so it can be placed between double quotes in generated Rust source.
///
/// Backslashes are doubled and double quotes get a leading backslash; every other
/// character is copied through unchanged.
fn escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            other => escaped.push(other),
        }
    }
    escaped
}
|
||||||
|
|
||||||
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
|
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
|
||||||
let mut files = fs::read_dir(proto_dir)?
|
let mut files = fs::read_dir(proto_dir)?
|
||||||
.map(|entry| entry.map(|entry| entry.path()))
|
.map(|entry| entry.map(|entry| entry.path()))
|
||||||
|
|||||||
+9438
File diff suppressed because it is too large
Load Diff
@@ -385,6 +385,30 @@ message GetRawChangesResponse {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Request for per-language statistics of one tree in a repository.
message GetLanguageStatsRequest {
|
||||||
|
RepositoryHeader repository = 1;
|
||||||
|
ObjectSelector revision = 2; // defaults to HEAD if unset
|
||||||
|
string path = 3; // optional: restrict to subdirectory
|
||||||
|
uint32 max_file_size = 4; // files larger than this are not line-counted but still count toward file and byte totals (bytes, 0 = 512KB default)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-language statistics plus overall totals.
// The totals cover only non-empty files for which a language was detected.
message GetLanguageStatsResponse {
|
||||||
|
repeated LanguageStat languages = 1; // sorted by bytes descending, then by language name
|
||||||
|
uint64 total_files = 2;
|
||||||
|
uint64 total_bytes = 3;
|
||||||
|
uint64 total_lines = 4; // lines that are not empty or whitespace-only
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aggregated statistics for one language (or language group) in the response.
message LanguageStat {
|
||||||
|
string language = 1; // language name, e.g. "Rust"
|
||||||
|
string lang_type = 2; // "programming", "markup", "data", "prose"
|
||||||
|
uint64 file_count = 3;
|
||||||
|
uint64 bytes = 4;
|
||||||
|
uint64 lines = 5;
|
||||||
|
double percentage = 6; // percentage by bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
message FetchRemoteRequest {
|
message FetchRemoteRequest {
|
||||||
RepositoryHeader repository = 1;
|
RepositoryHeader repository = 1;
|
||||||
string remote_url = 2;
|
string remote_url = 2;
|
||||||
@@ -453,4 +477,5 @@ service RepositoryService {
|
|||||||
rpc FindLicense(FindLicenseRequest) returns (FindLicenseResponse);
|
rpc FindLicense(FindLicenseRequest) returns (FindLicenseResponse);
|
||||||
rpc OptimizeRepository(OptimizeRepositoryRequest) returns (OptimizeRepositoryResponse);
|
rpc OptimizeRepository(OptimizeRepositoryRequest) returns (OptimizeRepositoryResponse);
|
||||||
rpc GetRawChanges(GetRawChangesRequest) returns (GetRawChangesResponse);
|
rpc GetRawChanges(GetRawChangesRequest) returns (GetRawChangesResponse);
|
||||||
|
rpc GetLanguageStats(GetLanguageStatsRequest) returns (GetLanguageStatsResponse);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,300 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use gix::object::tree::EntryKind;
|
||||||
|
|
||||||
|
use crate::bare::GitBare;
|
||||||
|
use crate::error::{GitError, GitResult};
|
||||||
|
use crate::pb::{
|
||||||
|
GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Include the generated linguist rules
|
||||||
|
include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));
|
||||||
|
|
||||||
|
/// Default max file size for line counting (512 KB).
|
||||||
|
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;
|
||||||
|
|
||||||
|
/// Look up a language by file extension (case-insensitive, includes leading dot).
|
||||||
|
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
|
||||||
|
let ext_lower = ext.to_lowercase();
|
||||||
|
// Binary search on the sorted EXTENSION_MAP
|
||||||
|
EXTENSION_MAP
|
||||||
|
.binary_search_by(|&(e, _, _)| e.cmp(ext_lower.as_str()))
|
||||||
|
.ok()
|
||||||
|
.map(|idx| {
|
||||||
|
let (_, lang, ltype) = EXTENSION_MAP[idx];
|
||||||
|
(lang, ltype)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up a language by exact filename.
|
||||||
|
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
|
||||||
|
FILENAME_MAP
|
||||||
|
.binary_search_by(|&(f, _, _)| f.cmp(name))
|
||||||
|
.ok()
|
||||||
|
.map(|idx| {
|
||||||
|
let (_, lang, ltype) = FILENAME_MAP[idx];
|
||||||
|
(lang, ltype)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve the group name for a language, if any.
|
||||||
|
fn resolve_group(lang: &str) -> Option<&'static str> {
|
||||||
|
LANG_GROUP_MAP
|
||||||
|
.binary_search_by_key(&lang, |&(l, _)| l)
|
||||||
|
.ok()
|
||||||
|
.map(|idx| LANG_GROUP_MAP[idx].1)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect language for a file path.
|
||||||
|
/// Returns (language_name, lang_type) or None if unknown.
|
||||||
|
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
|
||||||
|
let file_name = Path::new(path)
|
||||||
|
.file_name()
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.unwrap_or("");
|
||||||
|
|
||||||
|
// Try filename match first (e.g., Makefile, Dockerfile)
|
||||||
|
if let Some(result) = lookup_by_filename(file_name) {
|
||||||
|
tracing::debug!(path = %path, lang = result.0, "matched by filename");
|
||||||
|
return Some(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try extension match
|
||||||
|
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
||||||
|
let ext_with_dot = format!(".{ext}");
|
||||||
|
if let Some(result) = lookup_by_extension(&ext_with_dot) {
|
||||||
|
tracing::debug!(path = %path, ext = %ext_with_dot, lang = result.0, "matched by extension");
|
||||||
|
return Some(result);
|
||||||
|
}
|
||||||
|
tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
|
||||||
|
} else {
|
||||||
|
tracing::debug!(path = %path, "no extension found");
|
||||||
|
}
|
||||||
|
|
||||||
|
// For binary files with no recognized extension, classify by media type
|
||||||
|
if is_binary {
|
||||||
|
// Try extension-based binary classification
|
||||||
|
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
||||||
|
let ext_lower = format!(".{ext}").to_lowercase();
|
||||||
|
let media_type = classify_binary_extension(&ext_lower);
|
||||||
|
// Return as a synthetic language name
|
||||||
|
return Some((media_type, "data"));
|
||||||
|
}
|
||||||
|
return Some(("Binary", "data"));
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Counts the `\n`-separated lines of `data` that contain at least one byte that is not
/// ASCII whitespace.
fn count_code_lines(data: &[u8]) -> u64 {
    data.split(|&byte| byte == b'\n')
        .filter(|line| line.iter().any(|byte| !byte.is_ascii_whitespace()))
        .count() as u64
}
|
||||||
|
|
||||||
|
/// Running totals accumulated for a single language while walking a tree.
|
||||||
|
#[derive(Default)]
|
||||||
|
struct LangStats {
|
||||||
|
/// Number of files attributed to the language.
file_count: u64,
|
||||||
|
/// Sum of the sizes of those files, in bytes.
bytes: u64,
|
||||||
|
/// Sum of the line counts produced by `count_code_lines` for those files.
lines: u64,
|
||||||
|
/// Language type string; empty until set by the code that creates the entry.
lang_type: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mutable state shared by every level of the recursive tree walk.
|
||||||
|
struct WalkContext<'a> {
|
||||||
|
/// Size limit in bytes above which a file's lines are not counted.
max_file_size: u32,
|
||||||
|
/// Per-language accumulators, keyed by detected language name.
stats: &'a mut HashMap<String, LangStats>,
|
||||||
|
/// Number of files that were counted.
total_files: &'a mut u64,
|
||||||
|
/// Total size in bytes of the counted files.
total_bytes: &'a mut u64,
|
||||||
|
/// Total line count of the counted files.
total_lines: &'a mut u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GitBare {
|
||||||
|
pub fn get_language_stats(
|
||||||
|
&self,
|
||||||
|
request: GetLanguageStatsRequest,
|
||||||
|
) -> GitResult<GetLanguageStatsResponse> {
|
||||||
|
let repo = self.gix_repo()?;
|
||||||
|
let revision = match request.revision.clone().and_then(|s| s.selector) {
|
||||||
|
Some(object_selector::Selector::Oid(oid)) => oid.hex,
|
||||||
|
Some(object_selector::Selector::Revision(name)) => {
|
||||||
|
crate::sanitize::validate_revision(&name.revision)?;
|
||||||
|
name.revision
|
||||||
|
}
|
||||||
|
None => "HEAD".into(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let max_file_size = if request.max_file_size == 0 {
|
||||||
|
DEFAULT_MAX_FILE_SIZE
|
||||||
|
} else {
|
||||||
|
request.max_file_size
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tree = repo
|
||||||
|
.rev_parse_single(format!("{}^{{tree}}", revision).as_str())?
|
||||||
|
.object()?
|
||||||
|
.try_into_tree()
|
||||||
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||||
|
|
||||||
|
// If path is specified, descend into subdirectory
|
||||||
|
if !request.path.is_empty() {
|
||||||
|
let entry = tree
|
||||||
|
.lookup_entry_by_path(&request.path)?
|
||||||
|
.ok_or_else(|| GitError::NotFound(request.path.clone()))?;
|
||||||
|
tree = entry
|
||||||
|
.object()?
|
||||||
|
.try_into_tree()
|
||||||
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let prefix = request.path.trim_matches('/').to_string();
|
||||||
|
let mut stats: HashMap<String, LangStats> = HashMap::new();
|
||||||
|
let mut total_files = 0u64;
|
||||||
|
let mut total_bytes = 0u64;
|
||||||
|
let mut total_lines = 0u64;
|
||||||
|
|
||||||
|
let mut ctx = WalkContext {
|
||||||
|
max_file_size,
|
||||||
|
stats: &mut stats,
|
||||||
|
total_files: &mut total_files,
|
||||||
|
total_bytes: &mut total_bytes,
|
||||||
|
total_lines: &mut total_lines,
|
||||||
|
};
|
||||||
|
self.walk_tree(&repo, &tree, &prefix, &mut ctx)?;
|
||||||
|
|
||||||
|
// Resolve groups: merge child language stats into parent group
|
||||||
|
tracing::info!(
|
||||||
|
total_files,
|
||||||
|
total_bytes,
|
||||||
|
total_lines,
|
||||||
|
languages_found = stats.len(),
|
||||||
|
"raw language stats before group resolution"
|
||||||
|
);
|
||||||
|
let mut resolved: HashMap<String, LangStats> = HashMap::new();
|
||||||
|
for (lang, s) in stats {
|
||||||
|
let target = resolve_group(&lang).unwrap_or(&lang);
|
||||||
|
let entry = resolved.entry(target.to_string()).or_insert_with(|| LangStats {
|
||||||
|
lang_type: s.lang_type.clone(),
|
||||||
|
..Default::default()
|
||||||
|
});
|
||||||
|
entry.file_count += s.file_count;
|
||||||
|
entry.bytes += s.bytes;
|
||||||
|
entry.lines += s.lines;
|
||||||
|
// Keep the lang_type from the parent (or first encountered)
|
||||||
|
if entry.lang_type.is_empty() {
|
||||||
|
entry.lang_type = s.lang_type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build response sorted by bytes descending
|
||||||
|
let mut languages: Vec<LanguageStat> = resolved
|
||||||
|
.into_iter()
|
||||||
|
.map(|(language, s)| {
|
||||||
|
let percentage = if total_bytes > 0 {
|
||||||
|
(s.bytes as f64 / total_bytes as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
LanguageStat {
|
||||||
|
language,
|
||||||
|
lang_type: s.lang_type,
|
||||||
|
file_count: s.file_count,
|
||||||
|
bytes: s.bytes,
|
||||||
|
lines: s.lines,
|
||||||
|
percentage,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
languages.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.language.cmp(&b.language)));
|
||||||
|
|
||||||
|
Ok(GetLanguageStatsResponse {
|
||||||
|
languages,
|
||||||
|
total_files,
|
||||||
|
total_bytes,
|
||||||
|
total_lines,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn walk_tree(
|
||||||
|
&self,
|
||||||
|
_repo: &gix::Repository,
|
||||||
|
tree: &gix::Tree<'_>,
|
||||||
|
prefix: &str,
|
||||||
|
ctx: &mut WalkContext<'_>,
|
||||||
|
) -> GitResult<()> {
|
||||||
|
for entry in tree.iter() {
|
||||||
|
let entry = entry?;
|
||||||
|
let name = String::from_utf8_lossy(entry.filename()).into_owned();
|
||||||
|
let path = if prefix.is_empty() {
|
||||||
|
name.clone()
|
||||||
|
} else {
|
||||||
|
format!("{prefix}/{name}")
|
||||||
|
};
|
||||||
|
|
||||||
|
match entry.kind() {
|
||||||
|
EntryKind::Tree => {
|
||||||
|
let child_tree = entry
|
||||||
|
.object()?
|
||||||
|
.try_into_tree()
|
||||||
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||||
|
self.walk_tree(_repo, &child_tree, &path, ctx)?;
|
||||||
|
}
|
||||||
|
EntryKind::Blob | EntryKind::BlobExecutable => {
|
||||||
|
let blob = entry
|
||||||
|
.object()?
|
||||||
|
.try_into_blob()
|
||||||
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||||
|
let data = &blob.data;
|
||||||
|
let size = data.len() as u64;
|
||||||
|
|
||||||
|
// Skip empty files
|
||||||
|
if size == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if binary (contains null byte)
|
||||||
|
let is_binary = data.contains(&0);
|
||||||
|
|
||||||
|
// Detect language
|
||||||
|
let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else {
|
||||||
|
tracing::debug!(path = %path, is_binary, "no language detected");
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let lang_key = lang_name.to_string();
|
||||||
|
|
||||||
|
// Count code lines only for non-binary files within size limit
|
||||||
|
let lines = if !is_binary && (size as u32) <= ctx.max_file_size {
|
||||||
|
count_code_lines(data)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
|
*ctx.total_files += 1;
|
||||||
|
*ctx.total_bytes += size;
|
||||||
|
*ctx.total_lines += lines;
|
||||||
|
|
||||||
|
let s = ctx.stats.entry(lang_key.clone()).or_insert_with(|| LangStats {
|
||||||
|
lang_type: lang_type.to_string(),
|
||||||
|
..Default::default()
|
||||||
|
});
|
||||||
|
s.file_count += 1;
|
||||||
|
s.bytes += size;
|
||||||
|
s.lines += lines;
|
||||||
|
}
|
||||||
|
_ => {} // Skip symlinks, submodules
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
pub mod find_license;
|
pub mod find_license;
|
||||||
pub mod find_merge_base;
|
pub mod find_merge_base;
|
||||||
|
pub mod lang_stats;
|
||||||
pub mod objects_size;
|
pub mod objects_size;
|
||||||
pub mod optimize;
|
pub mod optimize;
|
||||||
pub mod raw_changes;
|
pub mod raw_changes;
|
||||||
|
|||||||
@@ -850,4 +850,21 @@ impl repository_service_server::RepositoryService for GitksService {
|
|||||||
m.record("ok");
|
m.record("ok");
|
||||||
Ok(tonic::Response::new(resp))
|
Ok(tonic::Response::new(resp))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// gRPC handler for `GetLanguageStats`: applies the per-repository rate limit, resolves
/// the repository and returns the statistics computed by its `get_language_stats`.
///
/// Any failure is returned as a `tonic::Status`; `m.record("ok")` is reached only on the
/// success path.
async fn get_language_stats(
|
||||||
|
&self,
|
||||||
|
request: tonic::Request<GetLanguageStatsRequest>,
|
||||||
|
) -> Result<tonic::Response<GetLanguageStatsResponse>, tonic::Status> {
|
||||||
|
let m = crate::metrics::RequestMetrics::new("gitks.RepositoryService/GetLanguageStats");
|
||||||
|
let inner = request.into_inner();
|
||||||
|
// The guard is held in `_rate` until the handler returns.
let _rate = self.acquire_rate_limit(inner.repository.as_ref()).await?;
|
||||||
|
let repo = self.repo_label(inner.repository.as_ref());
|
||||||
|
let span = tracing::info_span!("repo.get_language_stats", %repo);
|
||||||
|
// Entered after the only `.await` above, so the span guard is never held across an await.
let _enter = span.enter();
|
||||||
|
let gb = self.resolve(inner.repository.as_ref())?;
|
||||||
|
// NOTE(review): this is a synchronous call inside an async fn and walks the whole tree —
// confirm that running it on the async executor is acceptable for large repositories.
let resp = gb.get_language_stats(inner).map_err(into_status)?;
|
||||||
|
tracing::info!(%repo, languages = resp.languages.len(), "language stats done");
|
||||||
|
m.record("ok");
|
||||||
|
Ok(tonic::Response::new(resp))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,150 @@
|
|||||||
|
mod common;
|
||||||
|
|
||||||
|
use gitks::pb::GetLanguageStatsRequest;
|
||||||
|
use gitks::repository::lang_stats::{EXTENSION_MAP, FILENAME_MAP};
|
||||||
|
|
||||||
|
/// The generated extension table must be binary-searchable and must resolve the
/// overridden extensions `.md` and `.rs` to Markdown and Rust.
#[test]
fn test_extension_map_lookup() {
    // `.md` resolves to Markdown with type "prose".
    let md_idx = match EXTENSION_MAP.binary_search_by(|row| row.0.cmp(".md")) {
        Ok(idx) => idx,
        found => panic!(".md should be in EXTENSION_MAP, got {:?}", found),
    };
    let (_, md_language, md_type) = EXTENSION_MAP[md_idx];
    assert_eq!(md_language, "Markdown");
    assert_eq!(md_type, "prose");

    // `.rs` resolves to Rust.
    let rs_idx = match EXTENSION_MAP.binary_search_by(|row| row.0.cmp(".rs")) {
        Ok(idx) => idx,
        Err(_) => panic!(".rs should be in EXTENSION_MAP"),
    };
    let (_, rs_language, _) = EXTENSION_MAP[rs_idx];
    assert_eq!(rs_language, "Rust");
}
|
||||||
|
|
||||||
|
/// The generated filename table must be binary-searchable and must contain `Makefile`.
#[test]
fn test_filename_map_lookup() {
    let found = FILENAME_MAP
        .binary_search_by(|row| row.0.cmp("Makefile"))
        .is_ok();
    assert!(found, "Makefile should be in FILENAME_MAP");
}
|
||||||
|
|
||||||
|
/// End-to-end check on the fixture repository: totals are non-zero, Markdown and Rust are
/// detected, percentages add up to 100 and the result is ordered by size.
#[test]
fn test_language_stats_basic() {
    let (_dir, gb) = common::setup_bare_repo();

    let request = GetLanguageStatsRequest {
        repository: None,
        revision: None,
        path: String::new(),
        max_file_size: 0,
    };
    let resp = gb.get_language_stats(request).expect("get_language_stats");

    // Should have some files
    assert!(resp.total_files > 0, "expected some files");
    assert!(resp.total_bytes > 0, "expected some bytes");

    // Should detect Markdown (README.md)
    let md = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    assert!(md.file_count > 0);
    assert!(md.bytes > 0);
    assert!(md.lines > 0);

    // Should detect Rust (src/lib/mod.rs)
    let rust = resp
        .languages
        .iter()
        .find(|l| l.language == "Rust")
        .expect("should detect Rust language");
    assert!(rust.file_count > 0);

    // Percentages should sum to ~100%
    let mut total_pct = 0.0_f64;
    for stat in &resp.languages {
        total_pct += stat.percentage;
    }
    assert!(
        (total_pct - 100.0).abs() < 0.01,
        "percentages should sum to 100, got {total_pct}"
    );

    // Languages should be sorted by bytes descending
    for pair in resp.languages.windows(2) {
        assert!(
            pair[0].bytes >= pair[1].bytes,
            "languages should be sorted by bytes descending"
        );
    }
}
|
||||||
|
|
||||||
|
#[test]
/// Verifies the `lang_type` classification reported by `get_language_stats`:
/// Markdown must be `"prose"` and Rust must be `"programming"`.
///
/// Both languages are required to be present. Previously a missing language
/// was skipped silently, so the test could pass without checking anything;
/// `test_language_stats_basic` already asserts that the same request detects
/// both languages, so requiring them here adds no new fixture assumptions.
fn test_language_stats_lang_type() {
    let (_dir, gb) = common::setup_bare_repo();

    let resp = gb
        .get_language_stats(GetLanguageStatsRequest {
            repository: None,
            revision: None,
            path: String::new(),
            max_file_size: 0,
        })
        .expect("get_language_stats");

    // Markdown should be "prose" type
    let md = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    assert_eq!(md.lang_type, "prose", "Markdown should be prose type");

    // Rust should be "programming" type
    let rust = resp
        .languages
        .iter()
        .find(|l| l.language == "Rust")
        .expect("should detect Rust language");
    assert_eq!(
        rust.lang_type, "programming",
        "Rust should be programming type"
    );
}
|
||||||
|
|
||||||
|
#[test]
/// Restricts `get_language_stats` to the `src` path and checks that Rust is
/// still reported while Markdown (README.md lives at the repository root) is not.
fn test_language_stats_with_path() {
    let (_dir, gb) = common::setup_bare_repo();

    // Only the "src" subdirectory is analysed.
    let request = GetLanguageStatsRequest {
        repository: None,
        revision: None,
        path: "src".to_string(),
        max_file_size: 0,
    };
    let resp = gb.get_language_stats(request).expect("get_language_stats");

    // True when the response contains an entry for the named language.
    let reports = |name: &str| resp.languages.iter().any(|l| l.language == name);

    // Rust sources live under src/.
    assert!(reports("Rust"), "should find Rust in src/ directory");

    // README.md is at the root, so Markdown must be absent.
    assert!(
        !reports("Markdown"),
        "should not find Markdown in src/ directory"
    );
}
|
||||||
|
|
||||||
|
#[test]
/// Checks the Markdown line count reported by `get_language_stats`.
///
/// The Markdown entry is required to be present. Previously a missing entry
/// was skipped silently, so the test could pass without asserting anything.
///
/// NOTE(review): the assertion is only a lower bound (`>= 2`), so it does not
/// by itself prove that blank lines are excluded. Tightening it to an exact
/// count needs the fixture contents to be confirmed in
/// `common::setup_bare_repo`.
fn test_language_stats_line_count_excludes_blank_lines() {
    let (_dir, gb) = common::setup_bare_repo();

    let resp = gb
        .get_language_stats(GetLanguageStatsRequest {
            repository: None,
            revision: None,
            path: String::new(),
            max_file_size: 0,
        })
        .expect("get_language_stats");

    // README.md is expected to be "# Test\n\nUpdated.\n": 3 lines, of which
    // only "# Test" and "Updated." are non-blank (TODO confirm against fixture).
    let md = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    assert!(md.lines >= 2, "should count at least 2 code lines for README.md");
}
|
||||||
Reference in New Issue
Block a user