feat(repository): add language statistics analysis feature

- Remove data directory from gitignore to include language data
- Add build script to parse linguist languages.yml and generate static mappings
- Include serde and serde_yml dependencies for YAML parsing
- Add lang_stats module with language detection and statistics calculation
- Generate protobuf definitions for language statistics API endpoints
- Implement GetLanguageStats RPC endpoint in repository server
- Add comprehensive test suite for language statistics functionality
- Include extension and filename based language detection logic
- Implement binary file classification and group resolution features
This commit is contained in:
zhenyi
2026-06-10 13:06:59 +08:00
parent 9a0c26e5f6
commit 939931acad
10 changed files with 10202 additions and 1 deletions
-1
View File
@@ -5,4 +5,3 @@
.project .project
.settings .settings
.DS_Store .DS_Store
data
Generated
+32
View File
@@ -699,6 +699,7 @@ dependencies = [
"ractor_cluster", "ractor_cluster",
"serde", "serde",
"serde_json", "serde_json",
"serde_yml",
"sha2", "sha2",
"tempfile", "tempfile",
"thiserror", "thiserror",
@@ -1892,6 +1893,16 @@ version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libyml"
version = "0.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3302702afa434ffa30847a83305f0a69d6abd74293b6554c18ec85c7ef30c980"
dependencies = [
"anyhow",
"version_check",
]
[[package]] [[package]]
name = "linux-raw-sys" name = "linux-raw-sys"
version = "0.12.1" version = "0.12.1"
@@ -2518,6 +2529,12 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "ryu"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
[[package]] [[package]]
name = "same-file" name = "same-file"
version = "1.0.6" version = "1.0.6"
@@ -2582,6 +2599,21 @@ dependencies = [
"zmij", "zmij",
] ]
[[package]]
name = "serde_yml"
version = "0.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd"
dependencies = [
"indexmap",
"itoa",
"libyml",
"memchr",
"ryu",
"serde",
"version_check",
]
[[package]] [[package]]
name = "sha1" name = "sha1"
version = "0.10.6" version = "0.10.6"
+2
View File
@@ -46,3 +46,5 @@ path = "main.rs"
[build-dependencies] [build-dependencies]
tonic-prost-build = "0.14" tonic-prost-build = "0.14"
serde_yml = "0.0.12"
serde = { version = "1", features = ["derive"] }
+237
View File
@@ -1,14 +1,18 @@
use serde::Deserialize;
use std::collections::HashMap;
use std::fs; use std::fs;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
fn main() -> Result<(), Box<dyn std::error::Error>> { fn main() -> Result<(), Box<dyn std::error::Error>> {
let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?); let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
let proto_dir = manifest_dir.join("proto"); let proto_dir = manifest_dir.join("proto");
let data_dir = manifest_dir.join("data");
let out_dir = PathBuf::from(std::env::var("OUT_DIR")?); let out_dir = PathBuf::from(std::env::var("OUT_DIR")?);
fs::create_dir_all(&out_dir)?; fs::create_dir_all(&out_dir)?;
clean_generated_files(&out_dir)?; clean_generated_files(&out_dir)?;
// Proto compilation
let protos = proto_files(&proto_dir)?; let protos = proto_files(&proto_dir)?;
for proto in &protos { for proto in &protos {
println!("cargo:rerun-if-changed={}", proto.display()); println!("cargo:rerun-if-changed={}", proto.display());
@@ -23,9 +27,242 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.out_dir(&out_dir) .out_dir(&out_dir)
.compile_protos(&protos, &[proto_dir])?; .compile_protos(&protos, &[proto_dir])?;
// Linguist language stats generation
let languages_yml = data_dir.join("languages.yml");
println!("cargo:rerun-if-changed={}", languages_yml.display());
generate_linguist(&languages_yml, &out_dir)?;
Ok(()) Ok(())
} }
/// One language record from linguist's `languages.yml`.
///
/// Only the keys read by `generate_linguist` are declared here.
#[derive(Deserialize)]
struct LanguageEntry {
    /// Name of the parent language this entry is grouped under, if declared.
    #[serde(default)]
    group: Option<String>,
    /// Value of the YAML `type` key (renamed because `type` is a Rust keyword).
    #[serde(rename = "type")]
    lang_type: String,
    /// Exact filenames claimed by the language; empty when the key is absent.
    #[serde(default)]
    filenames: Vec<String>,
    /// Extensions claimed by the language, in the order the YAML lists them;
    /// empty when the key is absent.
    #[serde(default)]
    extensions: Vec<String>,
}
fn generate_linguist(
languages_yml: &Path,
out_dir: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
let yaml_str = fs::read_to_string(languages_yml)?;
let languages: HashMap<String, LanguageEntry> = serde_yml::from_str(&yaml_str)?;
// Build extension → (language, type) mapping
// Track primary extensions (first listed for each language) for conflict resolution
let mut ext_map: Vec<(String, String, String)> = Vec::new();
let mut ext_primary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if primary
let mut ext_secondary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if secondary
// Build filename → (language, type) mapping
let mut fname_map: Vec<(String, String, String)> = Vec::new();
let mut fname_primary: HashMap<String, (String, String)> = HashMap::new();
let mut fname_secondary: HashMap<String, (String, String)> = HashMap::new();
// Build language → type mapping
let mut lang_type_map: Vec<(String, String)> = Vec::new();
// Build language → group mapping (for resolving group names)
let mut lang_group_map: HashMap<String, String> = HashMap::new();
// Process languages in alphabetical order (deterministic)
let mut sorted_langs: Vec<_> = languages.iter().collect();
sorted_langs.sort_by(|a, b| a.0.cmp(b.0));
for (name, entry) in &sorted_langs {
let resolved_type = entry.lang_type.clone();
lang_type_map.push((name.to_string(), resolved_type.clone()));
if let Some(ref group) = entry.group {
lang_group_map.insert(name.to_string(), group.clone());
}
for (i, ext) in entry.extensions.iter().enumerate() {
let ext_lower = ext.to_lowercase();
if i == 0 {
// Primary extension - always prefer this
ext_primary
.entry(ext_lower)
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
} else {
// Secondary extension - only use if no primary claims it
ext_secondary
.entry(ext_lower)
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
}
}
for (i, fname) in entry.filenames.iter().enumerate() {
if i == 0 {
fname_primary
.entry(fname.clone())
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
} else {
fname_secondary
.entry(fname.clone())
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
}
}
}
// Merge: primary wins over secondary, with explicit priority for known conflicts
// These are common extensions where linguist has multiple primary claims
let priority_overrides: HashMap<&str, &str> = [
(".rs", "Rust"), // RenderScript also claims .rs
(".md", "Markdown"), // GCC Machine Description also claims .md
(".r", "R"), // Rebol also claims .r
(".s", "Assembly"), // Multiple assemblers claim .s
(".ms", "MAXScript"), // Unix Assembly also claims .ms
(".g", "G-code"), // GAP also claims .g
(".m", "Objective-C"), // Mercury, MUF, etc. also claim .m
(".w", "CWeb"), // OpenSCAD also claims .w
(".q", "Q"), // KBD also claims .q
].iter().cloned().collect();
for (ext, (lang, ltype)) in ext_primary {
if let Some(&preferred) = priority_overrides.get(ext.as_str()) {
// Only use this entry if it matches the preferred language
if lang == preferred {
ext_map.push((ext, lang, ltype));
}
// Otherwise skip - the preferred language's entry will be added when we process it
} else {
ext_map.push((ext, lang, ltype));
}
}
// Add preferred languages for any overrides that weren't added yet
for (&ext, &preferred) in &priority_overrides {
if !ext_map.iter().any(|(e, _, _)| e == ext) {
// Find the preferred language's entry
if let Some(entry) = languages.get(preferred)
&& entry.extensions.iter().any(|e| e.to_lowercase() == ext)
{
ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone()));
}
}
}
for (ext, (lang, ltype)) in ext_secondary {
if !ext_map.iter().any(|(e, _, _)| e == &ext) {
ext_map.push((ext, lang, ltype));
}
}
for (fname, (lang, ltype)) in fname_primary {
fname_map.push((fname, lang, ltype));
}
for (fname, (lang, ltype)) in fname_secondary {
if !fname_map.iter().any(|(f, _, _)| f == &fname) {
fname_map.push((fname, lang, ltype));
}
}
// Sort for deterministic output
ext_map.sort_by(|a, b| a.0.cmp(&b.0));
fname_map.sort_by(|a, b| a.0.cmp(&b.0));
lang_type_map.sort_by(|a, b| a.0.cmp(&b.0));
let mut code = String::with_capacity(512 * 1024);
// Extension → (language_name, lang_type) mapping
code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n");
code.push_str("/// Extension to (language_name, type) mapping.\n");
code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n");
code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n");
for (ext, lang, ltype) in &ext_map {
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
escape_str(ext), escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Filename → (language_name, lang_type) mapping
code.push_str("/// Filename to (language_name, type) mapping.\n");
code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n");
code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n");
for (fname, lang, ltype) in &fname_map {
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
escape_str(fname), escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Language name → type mapping
code.push_str("/// Language name to type mapping.\n");
code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n");
for (lang, ltype) in &lang_type_map {
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Language name → group mapping
code.push_str("/// Language name to parent group mapping.\n");
code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n");
let mut group_vec: Vec<_> = lang_group_map.iter().collect();
group_vec.sort_by(|a, b| a.0.cmp(b.0));
for (lang, group) in group_vec {
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
escape_str(lang), escape_str(group)));
}
code.push_str("];\n\n");
// Binary extension classification
code.push_str("/// Binary media type classification for extensions.\n");
code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n");
code.push_str(" match ext {\n");
// Image extensions
let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif",
".apng", ".jfif", ".pjpeg", ".pjp"];
for ext in &image_exts {
code.push_str(&format!(" \"{}\" => \"Image\",\n", ext));
}
// Video extensions
let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"];
for ext in &video_exts {
code.push_str(&format!(" \"{}\" => \"Video\",\n", ext));
}
// Audio extensions
let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"];
for ext in &audio_exts {
code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext));
}
// Font extensions
let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"];
for ext in &font_exts {
code.push_str(&format!(" \"{}\" => \"Font\",\n", ext));
}
// Other binary
let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o",
".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo",
".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz",
".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls",
".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"];
for ext in &binary_exts {
code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext));
}
code.push_str(" _ => \"Binary\",\n");
code.push_str(" }\n");
code.push_str("}\n");
fs::write(out_dir.join("linguist_generated.rs"), code)?;
Ok(())
}
/// Escapes `s` so it can be embedded between double quotes in generated Rust source.
///
/// Backslashes and double quotes are escaped exactly as before. Control
/// characters are additionally written as escape sequences, because a raw
/// carriage return is rejected inside a Rust string literal and other raw
/// control characters make the generated file hard to inspect. All other
/// characters are copied unchanged.
fn escape_str(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if c.is_control() => out.push_str(&format!("\\u{{{:x}}}", c as u32)),
            c => out.push(c),
        }
    }
    out
}
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> { fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
let mut files = fs::read_dir(proto_dir)? let mut files = fs::read_dir(proto_dir)?
.map(|entry| entry.map(|entry| entry.path())) .map(|entry| entry.map(|entry| entry.path()))
+9438
View File
File diff suppressed because it is too large Load Diff
+25
View File
@@ -385,6 +385,30 @@ message GetRawChangesResponse {
} }
// Request for RepositoryService.GetLanguageStats.
message GetLanguageStatsRequest {
RepositoryHeader repository = 1;
ObjectSelector revision = 2; // defaults to HEAD if unset
string path = 3; // optional: restrict to subdirectory
uint32 max_file_size = 4; // files larger than this (bytes) are not line-counted but still count toward files/bytes; 0 = 512KB default
}
// Response for RepositoryService.GetLanguageStats.
// Totals cover only non-empty files for which a language was detected.
message GetLanguageStatsResponse {
repeated LanguageStat languages = 1; // sorted by bytes descending, then by language name
uint64 total_files = 2;
uint64 total_bytes = 3;
uint64 total_lines = 4; // sum of the per-language non-blank line counts
}
// Aggregated statistics for one language (after merging languages into their group).
message LanguageStat {
string language = 1; // language name, e.g. "Rust"
string lang_type = 2; // "programming", "markup", "data", "prose"
uint64 file_count = 3;
uint64 bytes = 4;
uint64 lines = 5; // non-blank lines; binary files and files over max_file_size contribute 0
double percentage = 6; // percentage by bytes
}
message FetchRemoteRequest { message FetchRemoteRequest {
RepositoryHeader repository = 1; RepositoryHeader repository = 1;
string remote_url = 2; string remote_url = 2;
@@ -453,4 +477,5 @@ service RepositoryService {
rpc FindLicense(FindLicenseRequest) returns (FindLicenseResponse); rpc FindLicense(FindLicenseRequest) returns (FindLicenseResponse);
rpc OptimizeRepository(OptimizeRepositoryRequest) returns (OptimizeRepositoryResponse); rpc OptimizeRepository(OptimizeRepositoryRequest) returns (OptimizeRepositoryResponse);
rpc GetRawChanges(GetRawChangesRequest) returns (GetRawChangesResponse); rpc GetRawChanges(GetRawChangesRequest) returns (GetRawChangesResponse);
rpc GetLanguageStats(GetLanguageStatsRequest) returns (GetLanguageStatsResponse);
} }
+300
View File
@@ -0,0 +1,300 @@
use std::collections::HashMap;
use std::path::Path;
use gix::object::tree::EntryKind;
use crate::bare::GitBare;
use crate::error::{GitError, GitResult};
use crate::pb::{
GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector,
};
// Include the generated linguist rules
include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));
/// Default max file size for line counting (512 KB), used when a request
/// leaves `max_file_size` at 0. Larger files still contribute to the file and
/// byte totals; only their lines are not counted.
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;
/// Finds the `(language, type)` pair registered for an extension.
///
/// `ext` must include the leading dot (e.g. `".rs"`); it is lowercased before
/// the lookup, so the match is case-insensitive. Returns `None` when the
/// extension is not in `EXTENSION_MAP`.
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
    let needle = ext.to_lowercase();
    // EXTENSION_MAP is generated sorted by key, which binary search requires.
    match EXTENSION_MAP.binary_search_by(|row| row.0.cmp(needle.as_str())) {
        Ok(pos) => {
            let (_, language, kind) = EXTENSION_MAP[pos];
            Some((language, kind))
        }
        Err(_) => None,
    }
}
/// Finds the `(language, type)` pair registered for an exact, case-sensitive
/// filename such as `Makefile`. Returns `None` when `FILENAME_MAP` has no entry.
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
    let pos = FILENAME_MAP
        .binary_search_by(|row| row.0.cmp(name))
        .ok()?;
    let (_, language, kind) = FILENAME_MAP[pos];
    Some((language, kind))
}
/// Returns the parent group recorded for `lang` in `LANG_GROUP_MAP`, or `None`
/// when the language has no group entry.
fn resolve_group(lang: &str) -> Option<&'static str> {
    let pos = LANG_GROUP_MAP
        .binary_search_by(|row| row.0.cmp(lang))
        .ok()?;
    Some(LANG_GROUP_MAP[pos].1)
}
/// Determines the language of the file at `path`.
///
/// Resolution order: exact filename match, then extension match. If neither
/// matches and `is_binary` is true, the file is reported under a synthetic
/// media-type name ("Image", "Video", ..., or "Binary") with type `"data"`.
/// Returns `None` for non-binary files that match nothing.
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
    let parsed = Path::new(path);
    let file_name = parsed.file_name().and_then(|n| n.to_str()).unwrap_or("");

    // Filenames such as Makefile or Dockerfile take precedence over extensions.
    if let Some(hit) = lookup_by_filename(file_name) {
        tracing::debug!(path = %path, lang = hit.0, "matched by filename");
        return Some(hit);
    }

    // Extension with its leading dot, when the path has a UTF-8 extension.
    let dotted_ext = parsed
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| format!(".{e}"));

    match &dotted_ext {
        Some(ext_with_dot) => {
            if let Some(hit) = lookup_by_extension(ext_with_dot) {
                tracing::debug!(path = %path, ext = %ext_with_dot, lang = hit.0, "matched by extension");
                return Some(hit);
            }
            tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
        }
        None => {
            tracing::debug!(path = %path, "no extension found");
        }
    }

    if !is_binary {
        return None;
    }

    // Unrecognized binary content: bucket it by media type instead of dropping it.
    let media_type = match dotted_ext {
        Some(ext_with_dot) => classify_binary_extension(&ext_with_dot.to_lowercase()),
        None => "Binary",
    };
    Some((media_type, "data"))
}
/// Counts the lines in `data` that contain at least one byte that is not
/// ASCII whitespace. Lines are separated by `\n`.
fn count_code_lines(data: &[u8]) -> u64 {
    data.split(|byte| *byte == b'\n')
        .filter(|line| line.iter().any(|byte| !byte.is_ascii_whitespace()))
        .count() as u64
}
/// Running totals accumulated for a single language.
#[derive(Default)]
struct LangStats {
    /// Language type string recorded when the entry was first created.
    lang_type: String,
    /// Number of files attributed to the language.
    file_count: u64,
    /// Sum of the blob sizes of those files, in bytes.
    bytes: u64,
    /// Sum of the counted (non-blank) lines of those files.
    lines: u64,
}
/// Mutable state threaded through the recursive tree walk.
struct WalkContext<'a> {
    /// Per-language accumulators, keyed by language name.
    stats: &'a mut HashMap<String, LangStats>,
    /// Number of files counted so far.
    total_files: &'a mut u64,
    /// Bytes counted so far.
    total_bytes: &'a mut u64,
    /// Lines counted so far.
    total_lines: &'a mut u64,
    /// Size limit in bytes above which a file's lines are not counted.
    max_file_size: u32,
}
impl GitBare {
pub fn get_language_stats(
&self,
request: GetLanguageStatsRequest,
) -> GitResult<GetLanguageStatsResponse> {
let repo = self.gix_repo()?;
let revision = match request.revision.clone().and_then(|s| s.selector) {
Some(object_selector::Selector::Oid(oid)) => oid.hex,
Some(object_selector::Selector::Revision(name)) => {
crate::sanitize::validate_revision(&name.revision)?;
name.revision
}
None => "HEAD".into(),
};
let max_file_size = if request.max_file_size == 0 {
DEFAULT_MAX_FILE_SIZE
} else {
request.max_file_size
};
let mut tree = repo
.rev_parse_single(format!("{}^{{tree}}", revision).as_str())?
.object()?
.try_into_tree()
.map_err(|e| GitError::Gix(e.to_string()))?;
// If path is specified, descend into subdirectory
if !request.path.is_empty() {
let entry = tree
.lookup_entry_by_path(&request.path)?
.ok_or_else(|| GitError::NotFound(request.path.clone()))?;
tree = entry
.object()?
.try_into_tree()
.map_err(|e| GitError::Gix(e.to_string()))?;
}
let prefix = request.path.trim_matches('/').to_string();
let mut stats: HashMap<String, LangStats> = HashMap::new();
let mut total_files = 0u64;
let mut total_bytes = 0u64;
let mut total_lines = 0u64;
let mut ctx = WalkContext {
max_file_size,
stats: &mut stats,
total_files: &mut total_files,
total_bytes: &mut total_bytes,
total_lines: &mut total_lines,
};
self.walk_tree(&repo, &tree, &prefix, &mut ctx)?;
// Resolve groups: merge child language stats into parent group
tracing::info!(
total_files,
total_bytes,
total_lines,
languages_found = stats.len(),
"raw language stats before group resolution"
);
let mut resolved: HashMap<String, LangStats> = HashMap::new();
for (lang, s) in stats {
let target = resolve_group(&lang).unwrap_or(&lang);
let entry = resolved.entry(target.to_string()).or_insert_with(|| LangStats {
lang_type: s.lang_type.clone(),
..Default::default()
});
entry.file_count += s.file_count;
entry.bytes += s.bytes;
entry.lines += s.lines;
// Keep the lang_type from the parent (or first encountered)
if entry.lang_type.is_empty() {
entry.lang_type = s.lang_type;
}
}
// Build response sorted by bytes descending
let mut languages: Vec<LanguageStat> = resolved
.into_iter()
.map(|(language, s)| {
let percentage = if total_bytes > 0 {
(s.bytes as f64 / total_bytes as f64) * 100.0
} else {
0.0
};
LanguageStat {
language,
lang_type: s.lang_type,
file_count: s.file_count,
bytes: s.bytes,
lines: s.lines,
percentage,
}
})
.collect();
languages.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.language.cmp(&b.language)));
Ok(GetLanguageStatsResponse {
languages,
total_files,
total_bytes,
total_lines,
})
}
fn walk_tree(
&self,
_repo: &gix::Repository,
tree: &gix::Tree<'_>,
prefix: &str,
ctx: &mut WalkContext<'_>,
) -> GitResult<()> {
for entry in tree.iter() {
let entry = entry?;
let name = String::from_utf8_lossy(entry.filename()).into_owned();
let path = if prefix.is_empty() {
name.clone()
} else {
format!("{prefix}/{name}")
};
match entry.kind() {
EntryKind::Tree => {
let child_tree = entry
.object()?
.try_into_tree()
.map_err(|e| GitError::Gix(e.to_string()))?;
self.walk_tree(_repo, &child_tree, &path, ctx)?;
}
EntryKind::Blob | EntryKind::BlobExecutable => {
let blob = entry
.object()?
.try_into_blob()
.map_err(|e| GitError::Gix(e.to_string()))?;
let data = &blob.data;
let size = data.len() as u64;
// Skip empty files
if size == 0 {
continue;
}
// Check if binary (contains null byte)
let is_binary = data.contains(&0);
// Detect language
let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else {
tracing::debug!(path = %path, is_binary, "no language detected");
continue;
};
let lang_key = lang_name.to_string();
// Count code lines only for non-binary files within size limit
let lines = if !is_binary && (size as u32) <= ctx.max_file_size {
count_code_lines(data)
} else {
0
};
*ctx.total_files += 1;
*ctx.total_bytes += size;
*ctx.total_lines += lines;
let s = ctx.stats.entry(lang_key.clone()).or_insert_with(|| LangStats {
lang_type: lang_type.to_string(),
..Default::default()
});
s.file_count += 1;
s.bytes += size;
s.lines += lines;
}
_ => {} // Skip symlinks, submodules
}
}
Ok(())
}
}
+1
View File
@@ -1,5 +1,6 @@
pub mod find_license; pub mod find_license;
pub mod find_merge_base; pub mod find_merge_base;
pub mod lang_stats;
pub mod objects_size; pub mod objects_size;
pub mod optimize; pub mod optimize;
pub mod raw_changes; pub mod raw_changes;
+17
View File
@@ -850,4 +850,21 @@ impl repository_service_server::RepositoryService for GitksService {
m.record("ok"); m.record("ok");
Ok(tonic::Response::new(resp)) Ok(tonic::Response::new(resp))
} }
/// gRPC handler for `GetLanguageStats`.
///
/// Acquires the per-repository rate limit, resolves the repository, and
/// delegates to `get_language_stats` on the resolved repository handle.
/// The "ok" metric is recorded only on success.
async fn get_language_stats(
    &self,
    request: tonic::Request<GetLanguageStatsRequest>,
) -> Result<tonic::Response<GetLanguageStatsResponse>, tonic::Status> {
    let metrics = crate::metrics::RequestMetrics::new("gitks.RepositoryService/GetLanguageStats");
    let req = request.into_inner();

    // Held until the handler returns.
    let _permit = self.acquire_rate_limit(req.repository.as_ref()).await?;

    let repo = self.repo_label(req.repository.as_ref());
    let span = tracing::info_span!("repo.get_language_stats", %repo);
    let _entered = span.enter();

    let bare = self.resolve(req.repository.as_ref())?;
    let response = bare.get_language_stats(req).map_err(into_status)?;

    tracing::info!(%repo, languages = response.languages.len(), "language stats done");
    metrics.record("ok");
    Ok(tonic::Response::new(response))
}
} }
+150
View File
@@ -0,0 +1,150 @@
mod common;
use gitks::pb::GetLanguageStatsRequest;
use gitks::repository::lang_stats::{EXTENSION_MAP, FILENAME_MAP};
/// `.md` and `.rs` must resolve to Markdown (prose) and Rust in the generated
/// extension table.
#[test]
fn test_extension_map_lookup() {
    // Same binary search the runtime lookup performs.
    let md_pos = EXTENSION_MAP.binary_search_by(|row| row.0.cmp(".md"));
    assert!(md_pos.is_ok(), ".md should be in EXTENSION_MAP, got {:?}", md_pos);
    let (_, md_lang, md_type) = EXTENSION_MAP[md_pos.unwrap()];
    assert_eq!(md_lang, "Markdown");
    assert_eq!(md_type, "prose");

    let rs_pos = EXTENSION_MAP.binary_search_by(|row| row.0.cmp(".rs"));
    assert!(rs_pos.is_ok(), ".rs should be in EXTENSION_MAP");
    let (_, rs_lang, _) = EXTENSION_MAP[rs_pos.unwrap()];
    assert_eq!(rs_lang, "Rust");
}
/// `Makefile` must be present in the generated filename table.
#[test]
fn test_filename_map_lookup() {
    let found = FILENAME_MAP
        .binary_search_by(|row| row.0.cmp("Makefile"))
        .is_ok();
    assert!(found, "Makefile should be in FILENAME_MAP");
}
/// End-to-end check on the fixture repository: totals are non-zero, Markdown
/// and Rust are detected, percentages add up to 100 and the list is ordered
/// by size.
#[test]
fn test_language_stats_basic() {
    let (_dir, gb) = common::setup_bare_repo();

    let request = GetLanguageStatsRequest {
        repository: None,
        revision: None,
        path: String::new(),
        max_file_size: 0,
    };
    let resp = gb.get_language_stats(request).expect("get_language_stats");

    // The fixture is not empty.
    assert!(resp.total_files > 0, "expected some files");
    assert!(resp.total_bytes > 0, "expected some bytes");

    // README.md
    let markdown = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    assert!(markdown.file_count > 0);
    assert!(markdown.bytes > 0);
    assert!(markdown.lines > 0);

    // src/lib/mod.rs
    let rust = resp
        .languages
        .iter()
        .find(|l| l.language == "Rust")
        .expect("should detect Rust language");
    assert!(rust.file_count > 0);

    // Shares are computed against total bytes, so they must add up to ~100%.
    let total_pct: f64 = resp.languages.iter().map(|l| l.percentage).sum();
    assert!(
        (total_pct - 100.0).abs() < 0.01,
        "percentages should sum to 100, got {total_pct}"
    );

    // Each language must be at least as large as the one after it.
    for pair in resp.languages.windows(2) {
        assert!(
            pair[0].bytes >= pair[1].bytes,
            "languages should be sorted by bytes descending"
        );
    }
}
/// Markdown must be reported as "prose" and Rust as "programming".
///
/// Both languages are required to be present: looking them up with
/// `if let Some(..)` would let the test pass without checking anything if
/// detection ever stopped finding them.
#[test]
fn test_language_stats_lang_type() {
    let (_dir, gb) = common::setup_bare_repo();

    let resp = gb
        .get_language_stats(GetLanguageStatsRequest {
            repository: None,
            revision: None,
            path: String::new(),
            max_file_size: 0,
        })
        .expect("get_language_stats");

    // Markdown should be "prose" type
    let md = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    assert_eq!(md.lang_type, "prose", "Markdown should be prose type");

    // Rust should be "programming" type
    let rust = resp
        .languages
        .iter()
        .find(|l| l.language == "Rust")
        .expect("should detect Rust language");
    assert_eq!(
        rust.lang_type, "programming",
        "Rust should be programming type"
    );
}
/// Restricting the request to `src` must include the Rust sources and exclude
/// the root-level README.
#[test]
fn test_language_stats_with_path() {
    let (_dir, gb) = common::setup_bare_repo();

    let request = GetLanguageStatsRequest {
        repository: None,
        revision: None,
        path: "src".to_string(),
        max_file_size: 0,
    };
    let resp = gb.get_language_stats(request).expect("get_language_stats");

    let has_rust = resp.languages.iter().any(|l| l.language == "Rust");
    assert!(has_rust, "should find Rust in src/ directory");

    // README.md lives at the repository root, outside src/.
    let has_markdown = resp.languages.iter().any(|l| l.language == "Markdown");
    assert!(
        !has_markdown,
        "should not find Markdown in src/ directory"
    );
}
/// Markdown line counts must include the non-blank README lines.
///
/// Markdown is required to be present so the assertion cannot be skipped
/// silently when detection fails.
#[test]
fn test_language_stats_line_count_excludes_blank_lines() {
    let (_dir, gb) = common::setup_bare_repo();

    let resp = gb
        .get_language_stats(GetLanguageStatsRequest {
            repository: None,
            revision: None,
            path: String::new(),
            max_file_size: 0,
        })
        .expect("get_language_stats");

    // README.md has "# Test\n\nUpdated.\n" = 3 lines but only 2 non-blank lines
    let md = resp
        .languages
        .iter()
        .find(|l| l.language == "Markdown")
        .expect("should detect Markdown language");
    // README.md: "# Test" and "Updated." are non-blank = 2 lines
    assert!(md.lines >= 2, "should count at least 2 code lines for README.md");
}