feat(repository): add language statistics analysis feature
- Remove data directory from gitignore to include language data
- Add build script to parse linguist languages.yml and generate static mappings
- Include serde and serde_yml dependencies for YAML parsing
- Add lang_stats module with language detection and statistics calculation
- Generate protobuf definitions for language statistics API endpoints
- Implement GetLanguageStats RPC endpoint in repository server
- Add comprehensive test suite for language statistics functionality
- Include extension and filename based language detection logic
- Implement binary file classification and group resolution features
This commit is contained in:
@@ -1,14 +1,18 @@
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
|
||||
let proto_dir = manifest_dir.join("proto");
|
||||
let data_dir = manifest_dir.join("data");
|
||||
let out_dir = PathBuf::from(std::env::var("OUT_DIR")?);
|
||||
|
||||
fs::create_dir_all(&out_dir)?;
|
||||
clean_generated_files(&out_dir)?;
|
||||
|
||||
// Proto compilation
|
||||
let protos = proto_files(&proto_dir)?;
|
||||
for proto in &protos {
|
||||
println!("cargo:rerun-if-changed={}", proto.display());
|
||||
@@ -23,9 +27,242 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
.out_dir(&out_dir)
|
||||
.compile_protos(&protos, &[proto_dir])?;
|
||||
|
||||
// Linguist language stats generation
|
||||
let languages_yml = data_dir.join("languages.yml");
|
||||
println!("cargo:rerun-if-changed={}", languages_yml.display());
|
||||
generate_linguist(&languages_yml, &out_dir)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct LanguageEntry {
|
||||
#[serde(rename = "type")]
|
||||
lang_type: String,
|
||||
#[serde(default)]
|
||||
extensions: Vec<String>,
|
||||
#[serde(default)]
|
||||
filenames: Vec<String>,
|
||||
#[serde(default)]
|
||||
group: Option<String>,
|
||||
}
|
||||
|
||||
fn generate_linguist(
|
||||
languages_yml: &Path,
|
||||
out_dir: &Path,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let yaml_str = fs::read_to_string(languages_yml)?;
|
||||
let languages: HashMap<String, LanguageEntry> = serde_yml::from_str(&yaml_str)?;
|
||||
|
||||
// Build extension → (language, type) mapping
|
||||
// Track primary extensions (first listed for each language) for conflict resolution
|
||||
let mut ext_map: Vec<(String, String, String)> = Vec::new();
|
||||
let mut ext_primary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if primary
|
||||
let mut ext_secondary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if secondary
|
||||
// Build filename → (language, type) mapping
|
||||
let mut fname_map: Vec<(String, String, String)> = Vec::new();
|
||||
let mut fname_primary: HashMap<String, (String, String)> = HashMap::new();
|
||||
let mut fname_secondary: HashMap<String, (String, String)> = HashMap::new();
|
||||
// Build language → type mapping
|
||||
let mut lang_type_map: Vec<(String, String)> = Vec::new();
|
||||
// Build language → group mapping (for resolving group names)
|
||||
let mut lang_group_map: HashMap<String, String> = HashMap::new();
|
||||
|
||||
// Process languages in alphabetical order (deterministic)
|
||||
let mut sorted_langs: Vec<_> = languages.iter().collect();
|
||||
sorted_langs.sort_by(|a, b| a.0.cmp(b.0));
|
||||
|
||||
for (name, entry) in &sorted_langs {
|
||||
let resolved_type = entry.lang_type.clone();
|
||||
lang_type_map.push((name.to_string(), resolved_type.clone()));
|
||||
|
||||
if let Some(ref group) = entry.group {
|
||||
lang_group_map.insert(name.to_string(), group.clone());
|
||||
}
|
||||
|
||||
for (i, ext) in entry.extensions.iter().enumerate() {
|
||||
let ext_lower = ext.to_lowercase();
|
||||
if i == 0 {
|
||||
// Primary extension - always prefer this
|
||||
ext_primary
|
||||
.entry(ext_lower)
|
||||
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||
} else {
|
||||
// Secondary extension - only use if no primary claims it
|
||||
ext_secondary
|
||||
.entry(ext_lower)
|
||||
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
for (i, fname) in entry.filenames.iter().enumerate() {
|
||||
if i == 0 {
|
||||
fname_primary
|
||||
.entry(fname.clone())
|
||||
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||
} else {
|
||||
fname_secondary
|
||||
.entry(fname.clone())
|
||||
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merge: primary wins over secondary, with explicit priority for known conflicts
|
||||
// These are common extensions where linguist has multiple primary claims
|
||||
let priority_overrides: HashMap<&str, &str> = [
|
||||
(".rs", "Rust"), // RenderScript also claims .rs
|
||||
(".md", "Markdown"), // GCC Machine Description also claims .md
|
||||
(".r", "R"), // Rebol also claims .r
|
||||
(".s", "Assembly"), // Multiple assemblers claim .s
|
||||
(".ms", "MAXScript"), // Unix Assembly also claims .ms
|
||||
(".g", "G-code"), // GAP also claims .g
|
||||
(".m", "Objective-C"), // Mercury, MUF, etc. also claim .m
|
||||
(".w", "CWeb"), // OpenSCAD also claims .w
|
||||
(".q", "Q"), // KBD also claims .q
|
||||
].iter().cloned().collect();
|
||||
|
||||
for (ext, (lang, ltype)) in ext_primary {
|
||||
if let Some(&preferred) = priority_overrides.get(ext.as_str()) {
|
||||
// Only use this entry if it matches the preferred language
|
||||
if lang == preferred {
|
||||
ext_map.push((ext, lang, ltype));
|
||||
}
|
||||
// Otherwise skip - the preferred language's entry will be added when we process it
|
||||
} else {
|
||||
ext_map.push((ext, lang, ltype));
|
||||
}
|
||||
}
|
||||
// Add preferred languages for any overrides that weren't added yet
|
||||
for (&ext, &preferred) in &priority_overrides {
|
||||
if !ext_map.iter().any(|(e, _, _)| e == ext) {
|
||||
// Find the preferred language's entry
|
||||
if let Some(entry) = languages.get(preferred)
|
||||
&& entry.extensions.iter().any(|e| e.to_lowercase() == ext)
|
||||
{
|
||||
ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (ext, (lang, ltype)) in ext_secondary {
|
||||
if !ext_map.iter().any(|(e, _, _)| e == &ext) {
|
||||
ext_map.push((ext, lang, ltype));
|
||||
}
|
||||
}
|
||||
|
||||
for (fname, (lang, ltype)) in fname_primary {
|
||||
fname_map.push((fname, lang, ltype));
|
||||
}
|
||||
for (fname, (lang, ltype)) in fname_secondary {
|
||||
if !fname_map.iter().any(|(f, _, _)| f == &fname) {
|
||||
fname_map.push((fname, lang, ltype));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort for deterministic output
|
||||
ext_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
fname_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
lang_type_map.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
|
||||
let mut code = String::with_capacity(512 * 1024);
|
||||
|
||||
// Extension → (language_name, lang_type) mapping
|
||||
code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n");
|
||||
code.push_str("/// Extension to (language_name, type) mapping.\n");
|
||||
code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n");
|
||||
code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n");
|
||||
for (ext, lang, ltype) in &ext_map {
|
||||
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
||||
escape_str(ext), escape_str(lang), escape_str(ltype)));
|
||||
}
|
||||
code.push_str("];\n\n");
|
||||
|
||||
// Filename → (language_name, lang_type) mapping
|
||||
code.push_str("/// Filename to (language_name, type) mapping.\n");
|
||||
code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n");
|
||||
code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n");
|
||||
for (fname, lang, ltype) in &fname_map {
|
||||
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
||||
escape_str(fname), escape_str(lang), escape_str(ltype)));
|
||||
}
|
||||
code.push_str("];\n\n");
|
||||
|
||||
// Language name → type mapping
|
||||
code.push_str("/// Language name to type mapping.\n");
|
||||
code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n");
|
||||
for (lang, ltype) in &lang_type_map {
|
||||
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
||||
escape_str(lang), escape_str(ltype)));
|
||||
}
|
||||
code.push_str("];\n\n");
|
||||
|
||||
// Language name → group mapping
|
||||
code.push_str("/// Language name to parent group mapping.\n");
|
||||
code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n");
|
||||
let mut group_vec: Vec<_> = lang_group_map.iter().collect();
|
||||
group_vec.sort_by(|a, b| a.0.cmp(b.0));
|
||||
for (lang, group) in group_vec {
|
||||
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
||||
escape_str(lang), escape_str(group)));
|
||||
}
|
||||
code.push_str("];\n\n");
|
||||
|
||||
// Binary extension classification
|
||||
code.push_str("/// Binary media type classification for extensions.\n");
|
||||
code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n");
|
||||
code.push_str(" match ext {\n");
|
||||
|
||||
// Image extensions
|
||||
let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
|
||||
".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif",
|
||||
".apng", ".jfif", ".pjpeg", ".pjp"];
|
||||
for ext in &image_exts {
|
||||
code.push_str(&format!(" \"{}\" => \"Image\",\n", ext));
|
||||
}
|
||||
|
||||
// Video extensions
|
||||
let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
|
||||
".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"];
|
||||
for ext in &video_exts {
|
||||
code.push_str(&format!(" \"{}\" => \"Video\",\n", ext));
|
||||
}
|
||||
|
||||
// Audio extensions
|
||||
let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
|
||||
".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"];
|
||||
for ext in &audio_exts {
|
||||
code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext));
|
||||
}
|
||||
|
||||
// Font extensions
|
||||
let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"];
|
||||
for ext in &font_exts {
|
||||
code.push_str(&format!(" \"{}\" => \"Font\",\n", ext));
|
||||
}
|
||||
|
||||
// Other binary
|
||||
let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o",
|
||||
".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo",
|
||||
".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz",
|
||||
".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls",
|
||||
".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"];
|
||||
for ext in &binary_exts {
|
||||
code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext));
|
||||
}
|
||||
|
||||
code.push_str(" _ => \"Binary\",\n");
|
||||
code.push_str(" }\n");
|
||||
code.push_str("}\n");
|
||||
|
||||
fs::write(out_dir.join("linguist_generated.rs"), code)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Escapes `s` so it can be placed between double quotes in generated Rust source.
///
/// Every backslash and every double quote is prefixed with a backslash; all
/// other characters are copied through unchanged.
fn escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '"') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
|
||||
|
||||
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
|
||||
let mut files = fs::read_dir(proto_dir)?
|
||||
.map(|entry| entry.map(|entry| entry.path()))
|
||||
|
||||
Reference in New Issue
Block a user