feat(repository): add language statistics analysis feature
- Remove data directory from gitignore to include language data
- Add build script to parse linguist languages.yml and generate static mappings
- Include serde and serde_yml dependencies for YAML parsing
- Add lang_stats module with language detection and statistics calculation
- Generate protobuf definitions for language statistics API endpoints
- Implement GetLanguageStats RPC endpoint in repository server
- Add comprehensive test suite for language statistics functionality
- Include extension and filename based language detection logic
- Implement binary file classification and group resolution features
This commit is contained in:
@@ -0,0 +1,300 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use gix::object::tree::EntryKind;
|
||||
|
||||
use crate::bare::GitBare;
|
||||
use crate::error::{GitError, GitResult};
|
||||
use crate::pb::{
|
||||
GetLanguageStatsRequest, GetLanguageStatsResponse, LanguageStat, object_selector,
|
||||
};
|
||||
|
||||
// Include the generated linguist rules
|
||||
include!(concat!(env!("OUT_DIR"), "/linguist_generated.rs"));
|
||||
|
||||
/// Fallback size limit, in bytes, above which a file's lines are not counted (512 KiB).
///
/// Used when a request leaves `max_file_size` at zero.
const DEFAULT_MAX_FILE_SIZE: u32 = 512 * 1024;
|
||||
|
||||
/// Look up a language by file extension (case-insensitive, includes leading dot).
|
||||
fn lookup_by_extension(ext: &str) -> Option<(&'static str, &'static str)> {
|
||||
let ext_lower = ext.to_lowercase();
|
||||
// Binary search on the sorted EXTENSION_MAP
|
||||
EXTENSION_MAP
|
||||
.binary_search_by(|&(e, _, _)| e.cmp(ext_lower.as_str()))
|
||||
.ok()
|
||||
.map(|idx| {
|
||||
let (_, lang, ltype) = EXTENSION_MAP[idx];
|
||||
(lang, ltype)
|
||||
})
|
||||
}
|
||||
|
||||
/// Look up a language by exact filename.
|
||||
fn lookup_by_filename(name: &str) -> Option<(&'static str, &'static str)> {
|
||||
FILENAME_MAP
|
||||
.binary_search_by(|&(f, _, _)| f.cmp(name))
|
||||
.ok()
|
||||
.map(|idx| {
|
||||
let (_, lang, ltype) = FILENAME_MAP[idx];
|
||||
(lang, ltype)
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve the group name for a language, if any.
|
||||
fn resolve_group(lang: &str) -> Option<&'static str> {
|
||||
LANG_GROUP_MAP
|
||||
.binary_search_by_key(&lang, |&(l, _)| l)
|
||||
.ok()
|
||||
.map(|idx| LANG_GROUP_MAP[idx].1)
|
||||
}
|
||||
|
||||
/// Detect language for a file path.
|
||||
/// Returns (language_name, lang_type) or None if unknown.
|
||||
fn detect_language(path: &str, is_binary: bool) -> Option<(&'static str, &'static str)> {
|
||||
let file_name = Path::new(path)
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("");
|
||||
|
||||
// Try filename match first (e.g., Makefile, Dockerfile)
|
||||
if let Some(result) = lookup_by_filename(file_name) {
|
||||
tracing::debug!(path = %path, lang = result.0, "matched by filename");
|
||||
return Some(result);
|
||||
}
|
||||
|
||||
// Try extension match
|
||||
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
||||
let ext_with_dot = format!(".{ext}");
|
||||
if let Some(result) = lookup_by_extension(&ext_with_dot) {
|
||||
tracing::debug!(path = %path, ext = %ext_with_dot, lang = result.0, "matched by extension");
|
||||
return Some(result);
|
||||
}
|
||||
tracing::debug!(path = %path, ext = %ext_with_dot, "extension not found in map");
|
||||
} else {
|
||||
tracing::debug!(path = %path, "no extension found");
|
||||
}
|
||||
|
||||
// For binary files with no recognized extension, classify by media type
|
||||
if is_binary {
|
||||
// Try extension-based binary classification
|
||||
if let Some(ext) = Path::new(path).extension().and_then(|e| e.to_str()) {
|
||||
let ext_lower = format!(".{ext}").to_lowercase();
|
||||
let media_type = classify_binary_extension(&ext_lower);
|
||||
// Return as a synthetic language name
|
||||
return Some((media_type, "data"));
|
||||
}
|
||||
return Some(("Binary", "data"));
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Count the lines of `data` that contain at least one non-whitespace byte.
///
/// Lines are separated by `\n`; a line made up solely of ASCII whitespace
/// (including a lone `\r`) or of nothing at all is not counted.
fn count_code_lines(data: &[u8]) -> u64 {
    let non_blank = data
        .split(|&byte| byte == b'\n')
        .filter(|line| line.iter().any(|byte| !byte.is_ascii_whitespace()))
        .count();
    // usize -> u64 is a widening conversion on every supported target.
    non_blank as u64
}
|
||||
|
||||
/// Running totals accumulated for a single language.
#[derive(Default)]
struct LangStats {
    /// Number of files attributed to the language.
    file_count: u64,
    /// Sum of the blob sizes of those files, in bytes.
    bytes: u64,
    /// Sum of the counted (non-blank) lines of those files.
    lines: u64,
    /// Type label reported alongside the language (e.g. `"data"`).
    lang_type: String,
}
|
||||
|
||||
/// Context passed through recursive tree walking.
|
||||
struct WalkContext<'a> {
|
||||
max_file_size: u32,
|
||||
stats: &'a mut HashMap<String, LangStats>,
|
||||
total_files: &'a mut u64,
|
||||
total_bytes: &'a mut u64,
|
||||
total_lines: &'a mut u64,
|
||||
}
|
||||
|
||||
impl GitBare {
|
||||
pub fn get_language_stats(
|
||||
&self,
|
||||
request: GetLanguageStatsRequest,
|
||||
) -> GitResult<GetLanguageStatsResponse> {
|
||||
let repo = self.gix_repo()?;
|
||||
let revision = match request.revision.clone().and_then(|s| s.selector) {
|
||||
Some(object_selector::Selector::Oid(oid)) => oid.hex,
|
||||
Some(object_selector::Selector::Revision(name)) => {
|
||||
crate::sanitize::validate_revision(&name.revision)?;
|
||||
name.revision
|
||||
}
|
||||
None => "HEAD".into(),
|
||||
};
|
||||
|
||||
let max_file_size = if request.max_file_size == 0 {
|
||||
DEFAULT_MAX_FILE_SIZE
|
||||
} else {
|
||||
request.max_file_size
|
||||
};
|
||||
|
||||
let mut tree = repo
|
||||
.rev_parse_single(format!("{}^{{tree}}", revision).as_str())?
|
||||
.object()?
|
||||
.try_into_tree()
|
||||
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||
|
||||
// If path is specified, descend into subdirectory
|
||||
if !request.path.is_empty() {
|
||||
let entry = tree
|
||||
.lookup_entry_by_path(&request.path)?
|
||||
.ok_or_else(|| GitError::NotFound(request.path.clone()))?;
|
||||
tree = entry
|
||||
.object()?
|
||||
.try_into_tree()
|
||||
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||
}
|
||||
|
||||
let prefix = request.path.trim_matches('/').to_string();
|
||||
let mut stats: HashMap<String, LangStats> = HashMap::new();
|
||||
let mut total_files = 0u64;
|
||||
let mut total_bytes = 0u64;
|
||||
let mut total_lines = 0u64;
|
||||
|
||||
let mut ctx = WalkContext {
|
||||
max_file_size,
|
||||
stats: &mut stats,
|
||||
total_files: &mut total_files,
|
||||
total_bytes: &mut total_bytes,
|
||||
total_lines: &mut total_lines,
|
||||
};
|
||||
self.walk_tree(&repo, &tree, &prefix, &mut ctx)?;
|
||||
|
||||
// Resolve groups: merge child language stats into parent group
|
||||
tracing::info!(
|
||||
total_files,
|
||||
total_bytes,
|
||||
total_lines,
|
||||
languages_found = stats.len(),
|
||||
"raw language stats before group resolution"
|
||||
);
|
||||
let mut resolved: HashMap<String, LangStats> = HashMap::new();
|
||||
for (lang, s) in stats {
|
||||
let target = resolve_group(&lang).unwrap_or(&lang);
|
||||
let entry = resolved.entry(target.to_string()).or_insert_with(|| LangStats {
|
||||
lang_type: s.lang_type.clone(),
|
||||
..Default::default()
|
||||
});
|
||||
entry.file_count += s.file_count;
|
||||
entry.bytes += s.bytes;
|
||||
entry.lines += s.lines;
|
||||
// Keep the lang_type from the parent (or first encountered)
|
||||
if entry.lang_type.is_empty() {
|
||||
entry.lang_type = s.lang_type;
|
||||
}
|
||||
}
|
||||
|
||||
// Build response sorted by bytes descending
|
||||
let mut languages: Vec<LanguageStat> = resolved
|
||||
.into_iter()
|
||||
.map(|(language, s)| {
|
||||
let percentage = if total_bytes > 0 {
|
||||
(s.bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
LanguageStat {
|
||||
language,
|
||||
lang_type: s.lang_type,
|
||||
file_count: s.file_count,
|
||||
bytes: s.bytes,
|
||||
lines: s.lines,
|
||||
percentage,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
languages.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.language.cmp(&b.language)));
|
||||
|
||||
Ok(GetLanguageStatsResponse {
|
||||
languages,
|
||||
total_files,
|
||||
total_bytes,
|
||||
total_lines,
|
||||
})
|
||||
}
|
||||
|
||||
fn walk_tree(
|
||||
&self,
|
||||
_repo: &gix::Repository,
|
||||
tree: &gix::Tree<'_>,
|
||||
prefix: &str,
|
||||
ctx: &mut WalkContext<'_>,
|
||||
) -> GitResult<()> {
|
||||
for entry in tree.iter() {
|
||||
let entry = entry?;
|
||||
let name = String::from_utf8_lossy(entry.filename()).into_owned();
|
||||
let path = if prefix.is_empty() {
|
||||
name.clone()
|
||||
} else {
|
||||
format!("{prefix}/{name}")
|
||||
};
|
||||
|
||||
match entry.kind() {
|
||||
EntryKind::Tree => {
|
||||
let child_tree = entry
|
||||
.object()?
|
||||
.try_into_tree()
|
||||
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||
self.walk_tree(_repo, &child_tree, &path, ctx)?;
|
||||
}
|
||||
EntryKind::Blob | EntryKind::BlobExecutable => {
|
||||
let blob = entry
|
||||
.object()?
|
||||
.try_into_blob()
|
||||
.map_err(|e| GitError::Gix(e.to_string()))?;
|
||||
let data = &blob.data;
|
||||
let size = data.len() as u64;
|
||||
|
||||
// Skip empty files
|
||||
if size == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if binary (contains null byte)
|
||||
let is_binary = data.contains(&0);
|
||||
|
||||
// Detect language
|
||||
let Some((lang_name, lang_type)) = detect_language(&path, is_binary) else {
|
||||
tracing::debug!(path = %path, is_binary, "no language detected");
|
||||
continue;
|
||||
};
|
||||
|
||||
let lang_key = lang_name.to_string();
|
||||
|
||||
// Count code lines only for non-binary files within size limit
|
||||
let lines = if !is_binary && (size as u32) <= ctx.max_file_size {
|
||||
count_code_lines(data)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
*ctx.total_files += 1;
|
||||
*ctx.total_bytes += size;
|
||||
*ctx.total_lines += lines;
|
||||
|
||||
let s = ctx.stats.entry(lang_key.clone()).or_insert_with(|| LangStats {
|
||||
lang_type: lang_type.to_string(),
|
||||
..Default::default()
|
||||
});
|
||||
s.file_count += 1;
|
||||
s.bytes += size;
|
||||
s.lines += lines;
|
||||
}
|
||||
_ => {} // Skip symlinks, submodules
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
pub mod find_license;
|
||||
pub mod find_merge_base;
|
||||
pub mod lang_stats;
|
||||
pub mod objects_size;
|
||||
pub mod optimize;
|
||||
pub mod raw_changes;
|
||||
|
||||
Reference in New Issue
Block a user