934858bebf
- Add repo_path parameter to cached_response and cached_vec_response functions - Implement structured cache key format with namespace, repo_path, and request proto - Replace global cache with Moka in-memory cache using weight-based eviction - Set 256MB memory cap with 10-minute TTL and 2-minute TTI policy - Add metrics collection for cache operations and evictions - Implement efficient repo-scoped invalidation using key structure - Add detailed documentation comments explaining cache architecture - Remove outdated dependencies and update dependency versions - Add error handling for encoding failures in cache operations - Optimize Vec responses with length-delimited encoding and pre-allocation
320 lines
11 KiB
Rust
320 lines
11 KiB
Rust
use std::collections::HashMap;
|
|
use std::path::Path;
|
|
|
|
use gix::object::tree::EntryKind;
|
|
|
|
use crate::bare::GitBare;
|
|
use crate::error::{GitError, GitResult};
|
|
use crate::pb::{GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector};
|
|
|
|
// Include the generated linguist rules
|
|
include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));
|
|
|
|
/// Size limit, in bytes, applied to line counting when a request leaves
/// `max_file_size` at zero (512 KiB).
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;

/// Deepest directory nesting the recursive tree walk accepts before it
/// returns an error.
const MAX_TREE_WALK_DEPTH: usize = 256;
|
|
|
|
/// Look up a language by file extension (case-insensitive, includes leading dot).
|
|
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
|
|
let ext_lower = ext.to_lowercase();
|
|
// Binary search on the sorted EXTENSION_MAP
|
|
EXTENSION_MAP
|
|
.binary_search_by(|&(e, _, _)| e.cmp(ext_lower.as_str()))
|
|
.ok()
|
|
.map(|idx| {
|
|
let (_, lang, ltype) = EXTENSION_MAP[idx];
|
|
(lang, ltype)
|
|
})
|
|
}
|
|
|
|
/// Look up a language by exact filename.
|
|
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
|
|
FILENAME_MAP
|
|
.binary_search_by(|&(f, _, _)| f.cmp(name))
|
|
.ok()
|
|
.map(|idx| {
|
|
let (_, lang, ltype) = FILENAME_MAP[idx];
|
|
(lang, ltype)
|
|
})
|
|
}
|
|
|
|
/// Resolve the group name for a language, if any.
|
|
fn resolve_group(lang: &str) -> Option<&'static str> {
|
|
LANG_GROUP_MAP
|
|
.binary_search_by_key(&lang, |&(l, _)| l)
|
|
.ok()
|
|
.map(|idx| LANG_GROUP_MAP[idx].1)
|
|
}
|
|
|
|
/// Detect language for a file path.
|
|
/// Returns (language_name, lang_type) or None if unknown.
|
|
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
|
|
let file_name = Path::new(path)
|
|
.file_name()
|
|
.and_then(|n| n.to_str())
|
|
.unwrap_or("");
|
|
|
|
// Try filename match first (e.g., Makefile, Dockerfile)
|
|
if let Some(result) = lookup_by_filename(file_name) {
|
|
tracing::debug!(path = %path, lang = result.0, "matched by filename");
|
|
return Some(result);
|
|
}
|
|
|
|
// Try extension match
|
|
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
|
let ext_with_dot = format!(".{ext}");
|
|
if let Some(result) = lookup_by_extension(&ext_with_dot) {
|
|
tracing::debug!(path = %path, ext = %ext_with_dot, lang = result.0, "matched by extension");
|
|
return Some(result);
|
|
}
|
|
tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
|
|
} else {
|
|
tracing::debug!(path = %path, "no extension found");
|
|
}
|
|
|
|
// For binary files with no recognized extension, classify by media type
|
|
if is_binary {
|
|
// Try extension-based binary classification
|
|
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
|
let ext_lower = format!(".{ext}").to_lowercase();
|
|
let media_type = classify_binary_extension(&ext_lower);
|
|
// Return as a synthetic language name
|
|
return Some((media_type, "data"));
|
|
}
|
|
return Some(("Binary", "data"));
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Counts the lines in `data` that contain at least one byte that is not
/// ASCII whitespace.
///
/// Lines are separated by `\n`. Empty lines and lines made up solely of
/// ASCII whitespace (including a trailing `\r`) are not counted.
fn count_code_lines(data: &[u8]) -> u64 {
    data.split(|&byte| byte == b'\n')
        .filter(|line| line.iter().any(|byte| !byte.is_ascii_whitespace()))
        .map(|_| 1u64)
        .sum()
}
|
|
|
|
/// Running totals accumulated for a single language.
#[derive(Default)]
struct LangStats {
    /// Number of blobs attributed to the language.
    file_count: u64,
    /// Sum of the sizes, in bytes, of those blobs.
    bytes: u64,
    /// Sum of the counted lines of those blobs.
    lines: u64,
    /// Type label reported alongside the language name by language detection.
    lang_type: String,
}
|
|
|
|
/// Context passed through recursive tree walking.
|
|
struct WalkContext<'a> {
|
|
max_file_size: u32,
|
|
stats: &'a mut HashMap<String, LangStats>,
|
|
total_files: &'a mut u64,
|
|
total_bytes: &'a mut u64,
|
|
total_lines: &'a mut u64,
|
|
}
|
|
|
|
impl GitBare {
|
|
pub fn get_language_stats(
|
|
&self,
|
|
request: GetLanguageStatsRequest,
|
|
) -> GitResult<GetLanguageStatsResponse> {
|
|
let repo = self.gix_repo()?;
|
|
let revision = match request.revision.clone().and_then(|s| s.selector) {
|
|
Some(object_selector::Selector::Oid(oid)) => {
|
|
crate::sanitize::validate_oid_hex(&oid.hex)?;
|
|
oid.hex
|
|
}
|
|
Some(object_selector::Selector::Revision(name)) => {
|
|
crate::sanitize::validate_revision(&name.revision)?;
|
|
name.revision
|
|
}
|
|
None => "HEAD".into(),
|
|
};
|
|
|
|
let max_file_size = if request.max_file_size == 0 {
|
|
DEFAULT_MAX_FILE_SIZE
|
|
} else {
|
|
request.max_file_size
|
|
};
|
|
|
|
let mut tree = repo
|
|
.rev_parse_single(format!("{}^{{tree}}", revision).as_str())?
|
|
.object()?
|
|
.try_into_tree()
|
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
|
|
|
// If path is specified, descend into subdirectory
|
|
if !request.path.is_empty() {
|
|
crate::sanitize::validate_file_path(&request.path)?;
|
|
let entry = tree
|
|
.lookup_entry_by_path(&request.path)?
|
|
.ok_or_else(|| GitError::NotFound(request.path.clone()))?;
|
|
tree = entry
|
|
.object()?
|
|
.try_into_tree()
|
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
|
}
|
|
|
|
let prefix = request.path.trim_matches('/').to_string();
|
|
let mut stats: HashMap<String, LangStats> = HashMap::new();
|
|
let mut total_files = 0u64;
|
|
let mut total_bytes = 0u64;
|
|
let mut total_lines = 0u64;
|
|
|
|
let mut ctx = WalkContext {
|
|
max_file_size,
|
|
stats: &mut stats,
|
|
total_files: &mut total_files,
|
|
total_bytes: &mut total_bytes,
|
|
total_lines: &mut total_lines,
|
|
};
|
|
self.walk_tree(&repo, &tree, &prefix, 0, &mut ctx)?;
|
|
|
|
// Resolve groups: merge child language stats into parent group
|
|
tracing::info!(
|
|
total_files,
|
|
total_bytes,
|
|
total_lines,
|
|
languages_found = stats.len(),
|
|
"raw language stats before group resolution"
|
|
);
|
|
let mut resolved: HashMap<String, LangStats> = HashMap::new();
|
|
for (lang, s) in stats {
|
|
let target = resolve_group(&lang).unwrap_or(&lang);
|
|
let entry = resolved
|
|
.entry(target.to_string())
|
|
.or_insert_with(|| LangStats {
|
|
lang_type: s.lang_type.clone(),
|
|
..Default::default()
|
|
});
|
|
entry.file_count = entry.file_count.saturating_add(s.file_count);
|
|
entry.bytes = entry.bytes.saturating_add(s.bytes);
|
|
entry.lines = entry.lines.saturating_add(s.lines);
|
|
// Keep the lang_type from the parent (or first encountered)
|
|
if entry.lang_type.is_empty() {
|
|
entry.lang_type = s.lang_type;
|
|
}
|
|
}
|
|
|
|
// Build response sorted by bytes descending
|
|
let mut languages: Vec<LanguageStat> = resolved
|
|
.into_iter()
|
|
.map(|(language, s)| {
|
|
let percentage = if total_bytes > 0 {
|
|
(s.bytes as f64 / total_bytes as f64) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
LanguageStat {
|
|
language,
|
|
lang_type: s.lang_type,
|
|
file_count: s.file_count,
|
|
bytes: s.bytes,
|
|
lines: s.lines,
|
|
percentage,
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
languages.sort_by(|a, b| {
|
|
b.bytes
|
|
.cmp(&a.bytes)
|
|
.then_with(|| a.language.cmp(&b.language))
|
|
});
|
|
|
|
Ok(GetLanguageStatsResponse {
|
|
languages,
|
|
total_files,
|
|
total_bytes,
|
|
total_lines,
|
|
})
|
|
}
|
|
|
|
fn walk_tree(
|
|
&self,
|
|
_repo: &gix::Repository,
|
|
tree: &gix::Tree<'_>,
|
|
prefix: &str,
|
|
depth: usize,
|
|
ctx: &mut WalkContext<'_>,
|
|
) -> GitResult<()> {
|
|
if depth > MAX_TREE_WALK_DEPTH {
|
|
return Err(GitError::InvalidArgument(format!(
|
|
"tree depth exceeds maximum of {MAX_TREE_WALK_DEPTH}"
|
|
)));
|
|
}
|
|
|
|
for entry in tree.iter() {
|
|
let entry = entry?;
|
|
let name = String::from_utf8_lossy(entry.filename()).into_owned();
|
|
let path = if prefix.is_empty() {
|
|
name.clone()
|
|
} else {
|
|
format!("{prefix}/{name}")
|
|
};
|
|
|
|
match entry.kind() {
|
|
EntryKind::Tree => {
|
|
let child_tree = entry
|
|
.object()?
|
|
.try_into_tree()
|
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
|
self.walk_tree(_repo, &child_tree, &path, depth + 1, ctx)?;
|
|
}
|
|
EntryKind::Blob | EntryKind::BlobExecutable => {
|
|
let blob = entry
|
|
.object()?
|
|
.try_into_blob()
|
|
.map_err(|e| GitError::Gix(e.to_string()))?;
|
|
let data = &blob.data;
|
|
let size = data.len() as u64;
|
|
|
|
// Skip empty files
|
|
if size == 0 {
|
|
continue;
|
|
}
|
|
|
|
// Check if binary (contains null byte)
|
|
let is_binary = data.contains(&0);
|
|
|
|
// Detect language
|
|
let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else {
|
|
tracing::debug!(path = %path, is_binary, "no language detected");
|
|
continue;
|
|
};
|
|
|
|
let lang_key = lang_name.to_string();
|
|
|
|
// Count code lines only for non-binary files within size limit
|
|
let lines = if !is_binary && size <= u64::from(ctx.max_file_size) {
|
|
count_code_lines(data)
|
|
} else {
|
|
0
|
|
};
|
|
|
|
*ctx.total_files = ctx.total_files.saturating_add(1);
|
|
*ctx.total_bytes = ctx.total_bytes.saturating_add(size);
|
|
*ctx.total_lines = ctx.total_lines.saturating_add(lines);
|
|
|
|
let s = ctx
|
|
.stats
|
|
.entry(lang_key.clone())
|
|
.or_insert_with(|| LangStats {
|
|
lang_type: lang_type.to_string(),
|
|
..Default::default()
|
|
});
|
|
s.file_count = s.file_count.saturating_add(1);
|
|
s.bytes = s.bytes.saturating_add(size);
|
|
s.lines = s.lines.saturating_add(lines);
|
|
}
|
|
_ => {} // Skip symlinks, submodules
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|