use std::collections::HashMap;
use std::path::Path;

use gix::object::tree::EntryKind;

use crate::bare::GitBare;
use crate::error::{GitError, GitResult};
use crate::pb::{GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector};

include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));

/// Size limit (512 KB) applied to line counting when the request leaves
/// `max_file_size` at zero; larger blobs contribute no lines.
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;

/// Deepest level of directory nesting the tree walk accepts before it
/// gives up with an error.
const MAX_TREE_WALK_DEPTH: usize = 256;

/// Find the language registered for a file extension.
///
/// `ext` must include the leading dot. It is lower-cased before the lookup,
/// so the match is case-insensitive. The lookup is a binary search over
/// `EXTENSION_MAP`.
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
    let needle = ext.to_lowercase();
    let pos = EXTENSION_MAP
        .binary_search_by(|&(candidate, _, _)| candidate.cmp(needle.as_str()))
        .ok()?;
    let (_, lang, ltype) = EXTENSION_MAP[pos];
    Some((lang, ltype))
}

/// Find the language registered for an exact (case-sensitive) file name.
///
/// The lookup is a binary search over `FILENAME_MAP`.
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
    let pos = FILENAME_MAP
        .binary_search_by(|&(candidate, _, _)| candidate.cmp(name))
        .ok()?;
    let (_, lang, ltype) = FILENAME_MAP[pos];
    Some((lang, ltype))
}

/// Return the group that `lang` is listed under in `LANG_GROUP_MAP`, or
/// `None` when the language has no entry there.
fn resolve_group(lang: &str) -> Option<&'static str> {
    match LANG_GROUP_MAP.binary_search_by(|&(member, _)| member.cmp(lang)) {
        Ok(pos) => Some(LANG_GROUP_MAP[pos].1),
        Err(_) => None,
    }
}

/// Detect the language of a file from its path.
///
/// An exact file-name match is tried first, then the file extension. A file
/// flagged as binary that matches neither is classified from its extension
/// with the `"data"` type (or as plain `"Binary"` when it has no extension).
/// Returns `(language_name, lang_type)`, or `None` for an unrecognised
/// non-binary file.
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
    let path_ref = Path::new(path);

    // 1. Exact file name (e.g. names that carry no usable extension).
    let file_name = path_ref.file_name().and_then(|n| n.to_str()).unwrap_or("");
    if let Some(result) = lookup_by_filename(file_name) {
        tracing::debug!(path = %path, lang = result.0, "matched by filename");
        return Some(result);
    }

    // 2. Extension, with its leading dot. Built once and reused by the
    //    binary fallback below.
    let dotted_ext = path_ref
        .extension()
        .and_then(|e| e.to_str())
        .map(|ext| format!(".{ext}"));
    match dotted_ext.as_deref() {
        Some(ext_with_dot) => {
            if let Some(result) = lookup_by_extension(ext_with_dot) {
                tracing::debug!(path = %path, ext = %ext_with_dot, lang = result.0, "matched by extension");
                return Some(result);
            }
            tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
        }
        None => {
            tracing::debug!(path = %path, "no extension found");
        }
    }

    // 3. Unknown text files are not reported at all; unknown binary files
    //    are bucketed by media type.
    if !is_binary {
        return None;
    }
    let media_type = match dotted_ext {
        Some(ext_with_dot) => {
            let ext_lower = ext_with_dot.to_lowercase();
            classify_binary_extension(&ext_lower)
        }
        None => "Binary",
    };
    Some((media_type, "data"))
}

/// Count the lines of `data` that contain at least one byte which is not
/// ASCII whitespace.
///
/// Lines are separated by `\n`. Empty lines and whitespace-only lines
/// (including a lone `\r`) are not counted.
fn count_code_lines(data: &[u8]) -> u64 {
    data.split(|&byte| byte == b'\n')
        // `all` is true for an empty line, so empty lines are rejected here too.
        .filter(|line| !line.iter().all(u8::is_ascii_whitespace))
        .count() as u64
}

/// Running totals for a single language.
#[derive(Default)]
struct LangStats {
    /// Number of blobs attributed to the language.
    file_count: u64,
    /// Sum of the sizes of those blobs, in bytes.
    bytes: u64,
    /// Sum of the counted lines of those blobs.
    lines: u64,
    /// Language type string as reported by language detection.
    lang_type: String,
}

/// State threaded through the recursive tree walk: the line-counting size
/// limit plus mutable handles to the per-language and overall totals.
struct WalkContext<'a> { max_file_size: u32, stats: &'a mut HashMap, total_files: &'a mut u64, total_bytes: &'a mut u64, total_lines: &'a mut u64, } impl GitBare { pub fn get_language_stats( &self, request: GetLanguageStatsRequest, ) -> GitResult { let repo = self.gix_repo()?; let revision = match request.revision.clone().and_then(|s| s.selector) { Some(object_selector::Selector::Oid(oid)) => { crate::sanitize::validate_oid_hex(&oid.hex)?; oid.hex } Some(object_selector::Selector::Revision(name)) => { crate::sanitize::validate_revision(&name.revision)?; name.revision } None => "HEAD".into(), }; let max_file_size = if request.max_file_size == 0 { DEFAULT_MAX_FILE_SIZE } else { request.max_file_size }; let mut tree = repo .rev_parse_single(format!("{}^{{tree}}", revision).as_str())? .object()? .try_into_tree() .map_err(|e| GitError::Gix(e.to_string()))?; if !request.path.is_empty() { crate::sanitize::validate_file_path(&request.path)?; let entry = tree .lookup_entry_by_path(&request.path)? .ok_or_else(|| GitError::NotFound(request.path.clone()))?; tree = entry .object()? 
.try_into_tree() .map_err(|e| GitError::Gix(e.to_string()))?; } let prefix = request.path.trim_matches('/').to_string(); let mut stats: HashMap = HashMap::new(); let mut total_files = 0u64; let mut total_bytes = 0u64; let mut total_lines = 0u64; let mut ctx = WalkContext { max_file_size, stats: &mut stats, total_files: &mut total_files, total_bytes: &mut total_bytes, total_lines: &mut total_lines, }; self.walk_tree(&repo, &tree, &prefix, 0, &mut ctx)?; tracing::info!( total_files, total_bytes, total_lines, languages_found = stats.len(), "raw language stats before group resolution" ); let mut resolved: HashMap = HashMap::new(); for (lang, s) in stats { let target = resolve_group(&lang).unwrap_or(&lang); let entry = resolved .entry(target.to_string()) .or_insert_with(|| LangStats { lang_type: s.lang_type.clone(), ..Default::default() }); entry.file_count = entry.file_count.saturating_add(s.file_count); entry.bytes = entry.bytes.saturating_add(s.bytes); entry.lines = entry.lines.saturating_add(s.lines); if entry.lang_type.is_empty() { entry.lang_type = s.lang_type; } } let mut languages: Vec = resolved .into_iter() .map(|(language, s)| { let percentage = if total_bytes > 0 { (s.bytes as f64 / total_bytes as f64) * 100.0 } else { 0.0 }; LanguageStat { language, lang_type: s.lang_type, file_count: s.file_count, bytes: s.bytes, lines: s.lines, percentage, } }) .collect(); languages.sort_by(|a, b| { b.bytes .cmp(&a.bytes) .then_with(|| a.language.cmp(&b.language)) }); Ok(GetLanguageStatsResponse { languages, total_files, total_bytes, total_lines, }) } fn walk_tree( &self, _repo: &gix::Repository, tree: &gix::Tree<'_>, prefix: &str, depth: usize, ctx: &mut WalkContext<'_>, ) -> GitResult<()> { if depth > MAX_TREE_WALK_DEPTH { return Err(GitError::InvalidArgument(format!( "tree depth exceeds maximum of {MAX_TREE_WALK_DEPTH}" ))); } for entry in tree.iter() { let entry = entry?; let name = String::from_utf8_lossy(entry.filename()).into_owned(); let path = if 
prefix.is_empty() { name.clone() } else { format!("{prefix}/{name}") }; match entry.kind() { EntryKind::Tree => { let child_tree = entry .object()? .try_into_tree() .map_err(|e| GitError::Gix(e.to_string()))?; self.walk_tree(_repo, &child_tree, &path, depth + 1, ctx)?; } EntryKind::Blob | EntryKind::BlobExecutable => { let blob = entry .object()? .try_into_blob() .map_err(|e| GitError::Gix(e.to_string()))?; let data = &blob.data; let size = data.len() as u64; if size == 0 { continue; } let is_binary = data.contains(&0); let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else { tracing::debug!(path = %path, is_binary, "no language detected"); continue; }; let lang_key = lang_name.to_string(); let lines = if !is_binary && size <= u64::from(ctx.max_file_size) { count_code_lines(data) } else { 0 }; *ctx.total_files = ctx.total_files.saturating_add(1); *ctx.total_bytes = ctx.total_bytes.saturating_add(size); *ctx.total_lines = ctx.total_lines.saturating_add(lines); let s = ctx .stats .entry(lang_key.clone()) .or_insert_with(|| LangStats { lang_type: lang_type.to_string(), ..Default::default() }); s.file_count = s.file_count.saturating_add(1); s.bytes = s.bytes.saturating_add(size); s.lines = s.lines.saturating_add(lines); } _ => {} // Skip symlinks, submodules } } Ok(()) } }