Files
gitks/repository/lang_stats.rs
T
zhenyi 934858bebf refactor(cache): redesign cache system with structured keys and improved performance
- Add repo_path parameter to cached_response and cached_vec_response functions
- Implement structured cache key format with namespace, repo_path, and request proto
- Replace global cache with Moka in-memory cache using weight-based eviction
- Set 256MB memory cap with 10-minute TTL and 2-minute TTI policy
- Add metrics collection for cache operations and evictions
- Implement efficient repo-scoped invalidation using key structure
- Add detailed documentation comments explaining cache architecture
- Remove outdated dependencies and update dependency versions
- Add error handling for encoding failures in cache operations
- Optimize Vec responses with length-delimited encoding and pre-allocation
2026-06-12 12:53:23 +08:00

320 lines
11 KiB
Rust

use std::collections::HashMap;
use std::path::Path;
use gix::object::tree::EntryKind;
use crate::bare::GitBare;
use crate::error::{GitError, GitResult};
use crate::pb::{GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector};
// Include the generated linguist rules.
// NOTE(review): `EXTENSION_MAP`, `FILENAME_MAP`, `LANG_GROUP_MAP` and
// `classify_binary_extension` are used below but not defined in this file, so
// they presumably come from this build-script output — confirm against build.rs.
include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));
/// Line-counting size limit (512 KiB) used when the request leaves
/// `max_file_size` at 0. Larger files still contribute bytes, but no lines.
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;
/// Deepest directory nesting `walk_tree` accepts; going past it aborts the
/// walk with an `InvalidArgument` error.
const MAX_TREE_WALK_DEPTH: usize = 256;
/// Find the language registered for a file extension.
///
/// `ext` is expected to carry its leading dot (callers pass `".rs"`, not
/// `"rs"`). The input is lowercased before the search, so the match is
/// case-insensitive as long as the table keys are lowercase.
///
/// Returns `(language_name, lang_type)` on a hit, `None` otherwise.
// NOTE(review): relies on `EXTENSION_MAP` being sorted by its first field;
// the table is generated elsewhere, so confirm that invariant there.
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
    let needle = ext.to_lowercase();
    match EXTENSION_MAP.binary_search_by(|row| row.0.cmp(needle.as_str())) {
        Ok(pos) => {
            let (_, language, kind) = EXTENSION_MAP[pos];
            Some((language, kind))
        }
        Err(_) => None,
    }
}
/// Find the language registered for an exact, case-sensitive file name
/// (for example `Makefile`).
///
/// Returns `(language_name, lang_type)` on a hit, `None` otherwise.
// NOTE(review): relies on `FILENAME_MAP` being sorted by its first field;
// the table is generated elsewhere, so confirm that invariant there.
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
    match FILENAME_MAP.binary_search_by(|row| row.0.cmp(name)) {
        Ok(pos) => {
            let (_, language, kind) = FILENAME_MAP[pos];
            Some((language, kind))
        }
        Err(_) => None,
    }
}
/// Return the group a language is folded into, or `None` when the language
/// has no entry in the group table and therefore stands on its own.
// NOTE(review): relies on `LANG_GROUP_MAP` being sorted by language name;
// the table is generated elsewhere, so confirm that invariant there.
fn resolve_group(lang: &str) -> Option<&'static str> {
    let pos = LANG_GROUP_MAP
        .binary_search_by(|&(member, _)| member.cmp(lang))
        .ok()?;
    Some(LANG_GROUP_MAP[pos].1)
}
/// Classify a file by its path.
///
/// Resolution order:
/// 1. exact file-name match,
/// 2. extension match (case-insensitive),
/// 3. for binary content only: a synthetic `"data"` entry named after the
///    media type of the extension, or `"Binary"` when there is no extension.
///
/// Returns `(language_name, lang_type)`, or `None` for a non-binary file that
/// neither table recognises.
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
    let location = Path::new(path);
    let base_name = location
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("");

    // File names such as `Makefile` take precedence over any extension.
    if let Some(hit) = lookup_by_filename(base_name) {
        tracing::debug!(path = %path, lang = hit.0, "matched by filename");
        return Some(hit);
    }

    // Derive the dotted extension once; both the table lookup and the binary
    // fallback below need it.
    let dotted_ext = location
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| format!(".{e}"));

    match &dotted_ext {
        Some(ext_with_dot) => {
            if let Some(hit) = lookup_by_extension(ext_with_dot) {
                tracing::debug!(path = %path, ext = %ext_with_dot, lang = hit.0, "matched by extension");
                return Some(hit);
            }
            tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
        }
        None => {
            tracing::debug!(path = %path, "no extension found");
        }
    }

    // Unrecognised text files are not counted at all.
    if !is_binary {
        return None;
    }

    // Unrecognised binaries are bucketed by media type so they still show up.
    let label = match dotted_ext {
        Some(ext_with_dot) => classify_binary_extension(&ext_with_dot.to_lowercase()),
        None => "Binary",
    };
    Some((label, "data"))
}
/// Count the lines of `data` that contain at least one byte that is not ASCII
/// whitespace.
///
/// Lines are delimited by `\n`; a trailing `\r` counts as whitespace, so
/// blank CRLF lines are skipped as well.
fn count_code_lines(data: &[u8]) -> u64 {
    data.split(|&byte| byte == b'\n')
        .filter(|line| line.iter().any(|byte| !byte.is_ascii_whitespace()))
        .fold(0u64, |total, _| total + 1)
}
/// Aggregated stats per language.
///
/// One instance is kept per language name while the tree is walked, and again
/// per group once child languages have been merged into their parent group.
#[derive(Default)]
struct LangStats {
// Number of blobs attributed to this language.
file_count: u64,
// Sum of the blob sizes, in bytes.
bytes: u64,
// Non-blank lines; binary blobs and blobs above the size limit add 0 here.
lines: u64,
// Type label that accompanies the language in the lookup tables
// (`"data"` is the value used for synthetic binary entries).
lang_type: String,
}
/// Context passed through recursive tree walking.
///
/// Bundles the size limit with mutable borrows of the caller's accumulators so
/// the recursion does not need a long parameter list.
struct WalkContext<'a> {
// Blobs larger than this many bytes contribute bytes but no line count.
max_file_size: u32,
// Per-language accumulators, keyed by language name (before group merging).
stats: &'a mut HashMap<String, LangStats>,
// Running totals over every blob that was attributed to a language.
total_files: &'a mut u64,
total_bytes: &'a mut u64,
total_lines: &'a mut u64,
}
impl GitBare {
    /// Compute per-language file, byte and line statistics for a tree.
    ///
    /// * `request.revision` — an object id or a revision name; `HEAD` is used
    ///   when no selector is set. Both forms are validated before use.
    /// * `request.path` — optional sub-directory to restrict the walk to.
    /// * `request.max_file_size` — files larger than this are counted in
    ///   bytes but not in lines; `0` selects `DEFAULT_MAX_FILE_SIZE`.
    ///
    /// Languages that belong to a group are merged into that group. The
    /// result is sorted by bytes descending, ties broken by name ascending.
    /// Percentages are relative to `total_bytes`, which only covers files
    /// that were attributed to a language.
    ///
    /// # Errors
    ///
    /// Propagates validation and repository errors, returns
    /// `GitError::NotFound` when `request.path` does not exist in the tree,
    /// `GitError::Gix` when the resolved object is not a tree, and
    /// `GitError::InvalidArgument` when the tree nests deeper than
    /// `MAX_TREE_WALK_DEPTH`.
    pub fn get_language_stats(
        &self,
        request: GetLanguageStatsRequest,
    ) -> GitResult<GetLanguageStatsResponse> {
        let repo = self.gix_repo()?;

        let revision = match request.revision.clone().and_then(|s| s.selector) {
            Some(object_selector::Selector::Oid(oid)) => {
                crate::sanitize::validate_oid_hex(&oid.hex)?;
                oid.hex
            }
            Some(object_selector::Selector::Revision(name)) => {
                crate::sanitize::validate_revision(&name.revision)?;
                name.revision
            }
            None => "HEAD".into(),
        };

        let max_file_size = if request.max_file_size == 0 {
            DEFAULT_MAX_FILE_SIZE
        } else {
            request.max_file_size
        };

        // `^{tree}` peels whatever the revision names down to its tree.
        let mut tree = repo
            .rev_parse_single(format!("{}^{{tree}}", revision).as_str())?
            .object()?
            .try_into_tree()
            .map_err(|e| GitError::Gix(e.to_string()))?;

        // If path is specified, descend into subdirectory
        if !request.path.is_empty() {
            crate::sanitize::validate_file_path(&request.path)?;
            let entry = tree
                .lookup_entry_by_path(&request.path)?
                .ok_or_else(|| GitError::NotFound(request.path.clone()))?;
            tree = entry
                .object()?
                .try_into_tree()
                .map_err(|e| GitError::Gix(e.to_string()))?;
        }

        let prefix = request.path.trim_matches('/').to_string();
        let mut stats: HashMap<String, LangStats> = HashMap::new();
        let mut total_files = 0u64;
        let mut total_bytes = 0u64;
        let mut total_lines = 0u64;
        let mut ctx = WalkContext {
            max_file_size,
            stats: &mut stats,
            total_files: &mut total_files,
            total_bytes: &mut total_bytes,
            total_lines: &mut total_lines,
        };
        self.walk_tree(&repo, &tree, &prefix, 0, &mut ctx)?;

        tracing::info!(
            total_files,
            total_bytes,
            total_lines,
            languages_found = stats.len(),
            "raw language stats before group resolution"
        );

        // Resolve groups: merge child language stats into their parent group.
        //
        // `HashMap` iteration order is arbitrary, so the members are visited
        // in sorted name order; otherwise the `lang_type` reported for a
        // group could change between identical requests.
        let mut per_language: Vec<(String, LangStats)> = stats.into_iter().collect();
        per_language.sort_unstable_by(|a, b| a.0.cmp(&b.0));

        let mut resolved: HashMap<String, LangStats> = HashMap::with_capacity(per_language.len());
        for (lang, s) in per_language {
            let target = resolve_group(&lang).unwrap_or(lang.as_str());
            // True when this entry is the group's own (parent) language.
            let is_parent = target == lang.as_str();
            let entry = resolved.entry(target.to_string()).or_default();
            entry.file_count = entry.file_count.saturating_add(s.file_count);
            entry.bytes = entry.bytes.saturating_add(s.bytes);
            entry.lines = entry.lines.saturating_add(s.lines);
            // The parent's own type always wins; without a parent, the first
            // member in name order supplies it.
            if is_parent || entry.lang_type.is_empty() {
                entry.lang_type = s.lang_type;
            }
        }

        // Build response sorted by bytes descending
        let mut languages: Vec<LanguageStat> = resolved
            .into_iter()
            .map(|(language, s)| {
                let percentage = if total_bytes > 0 {
                    (s.bytes as f64 / total_bytes as f64) * 100.0
                } else {
                    0.0
                };
                LanguageStat {
                    language,
                    lang_type: s.lang_type,
                    file_count: s.file_count,
                    bytes: s.bytes,
                    lines: s.lines,
                    percentage,
                }
            })
            .collect();
        languages.sort_by(|a, b| {
            b.bytes
                .cmp(&a.bytes)
                .then_with(|| a.language.cmp(&b.language))
        });

        Ok(GetLanguageStatsResponse {
            languages,
            total_files,
            total_bytes,
            total_lines,
        })
    }

    /// Recursively accumulate language statistics for `tree` into `ctx`.
    ///
    /// `prefix` is the path of `tree` relative to the walk root (empty for
    /// the root itself) and `depth` is the current nesting level. Empty
    /// blobs, blobs with no detectable language, and entries that are
    /// neither trees nor regular/executable blobs are skipped.
    ///
    /// `_repo` is only forwarded to the recursive call; it is kept so the
    /// signature stays unchanged.
    ///
    /// # Errors
    ///
    /// Returns `GitError::InvalidArgument` once `depth` exceeds
    /// `MAX_TREE_WALK_DEPTH`, and propagates object-loading failures.
    fn walk_tree(
        &self,
        _repo: &gix::Repository,
        tree: &gix::Tree<'_>,
        prefix: &str,
        depth: usize,
        ctx: &mut WalkContext<'_>,
    ) -> GitResult<()> {
        if depth > MAX_TREE_WALK_DEPTH {
            return Err(GitError::InvalidArgument(format!(
                "tree depth exceeds maximum of {MAX_TREE_WALK_DEPTH}"
            )));
        }

        for entry in tree.iter() {
            let entry = entry?;
            let name = String::from_utf8_lossy(entry.filename()).into_owned();
            let path = if prefix.is_empty() {
                name
            } else {
                format!("{prefix}/{name}")
            };

            match entry.kind() {
                EntryKind::Tree => {
                    let child_tree = entry
                        .object()?
                        .try_into_tree()
                        .map_err(|e| GitError::Gix(e.to_string()))?;
                    self.walk_tree(_repo, &child_tree, &path, depth + 1, ctx)?;
                }
                EntryKind::Blob | EntryKind::BlobExecutable => {
                    let blob = entry
                        .object()?
                        .try_into_blob()
                        .map_err(|e| GitError::Gix(e.to_string()))?;
                    let data = &blob.data;
                    let size = data.len() as u64;

                    // Skip empty files
                    if size == 0 {
                        continue;
                    }

                    // Check if binary (contains null byte)
                    let is_binary = data.contains(&0);

                    // Files without a detectable language are left out of
                    // every total, not just the per-language map.
                    let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else {
                        tracing::debug!(path = %path, is_binary, "no language detected");
                        continue;
                    };

                    // Count code lines only for non-binary files within size limit
                    let lines = if !is_binary && size <= u64::from(ctx.max_file_size) {
                        count_code_lines(data)
                    } else {
                        0
                    };

                    *ctx.total_files = ctx.total_files.saturating_add(1);
                    *ctx.total_bytes = ctx.total_bytes.saturating_add(size);
                    *ctx.total_lines = ctx.total_lines.saturating_add(lines);

                    let s = ctx
                        .stats
                        .entry(lang_name.to_string())
                        .or_insert_with(|| LangStats {
                            lang_type: lang_type.to_string(),
                            ..Default::default()
                        });
                    s.file_count = s.file_count.saturating_add(1);
                    s.bytes = s.bytes.saturating_add(size);
                    s.lines = s.lines.saturating_add(lines);
                }
                _ => {} // Skip symlinks, submodules
            }
        }
        Ok(())
    }
}