939931acad
- Remove data directory from gitignore to include language data - Add build script to parse linguist languages.yml and generate static mappings - Include serde and serde_yml dependencies for YAML parsing - Add lang_stats module with language detection and statistics calculation - Generate protobuf definitions for language statistics API endpoints - Implement GetLanguageStats RPC endpoint in repository server - Add comprehensive test suite for language statistics functionality - Include extension and filename based language detection logic - Implement binary file classification and group resolution features
289 lines
11 KiB
Rust
289 lines
11 KiB
Rust
use serde::Deserialize;
|
|
use std::collections::HashMap;
|
|
use std::fs;
|
|
use std::path::{Path, PathBuf};
|
|
|
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
|
|
let proto_dir = manifest_dir.join("proto");
|
|
let data_dir = manifest_dir.join("data");
|
|
let out_dir = PathBuf::from(std::env::var("OUT_DIR")?);
|
|
|
|
fs::create_dir_all(&out_dir)?;
|
|
clean_generated_files(&out_dir)?;
|
|
|
|
// Proto compilation
|
|
let protos = proto_files(&proto_dir)?;
|
|
for proto in &protos {
|
|
println!("cargo:rerun-if-changed={}", proto.display());
|
|
}
|
|
println!("cargo:rerun-if-changed={}", proto_dir.display());
|
|
println!("cargo:rerun-if-changed=build.rs");
|
|
|
|
tonic_prost_build::configure()
|
|
.build_client(true)
|
|
.build_server(true)
|
|
.emit_rerun_if_changed(false)
|
|
.out_dir(&out_dir)
|
|
.compile_protos(&protos, &[proto_dir])?;
|
|
|
|
// Linguist language stats generation
|
|
let languages_yml = data_dir.join("languages.yml");
|
|
println!("cargo:rerun-if-changed={}", languages_yml.display());
|
|
generate_linguist(&languages_yml, &out_dir)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct LanguageEntry {
|
|
#[serde(rename = "type")]
|
|
lang_type: String,
|
|
#[serde(default)]
|
|
extensions: Vec<String>,
|
|
#[serde(default)]
|
|
filenames: Vec<String>,
|
|
#[serde(default)]
|
|
group: Option<String>,
|
|
}
|
|
|
|
fn generate_linguist(
|
|
languages_yml: &Path,
|
|
out_dir: &Path,
|
|
) -> Result<(), Box<dyn std::error::Error>> {
|
|
let yaml_str = fs::read_to_string(languages_yml)?;
|
|
let languages: HashMap<String, LanguageEntry> = serde_yml::from_str(&yaml_str)?;
|
|
|
|
// Build extension → (language, type) mapping
|
|
// Track primary extensions (first listed for each language) for conflict resolution
|
|
let mut ext_map: Vec<(String, String, String)> = Vec::new();
|
|
let mut ext_primary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if primary
|
|
let mut ext_secondary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if secondary
|
|
// Build filename → (language, type) mapping
|
|
let mut fname_map: Vec<(String, String, String)> = Vec::new();
|
|
let mut fname_primary: HashMap<String, (String, String)> = HashMap::new();
|
|
let mut fname_secondary: HashMap<String, (String, String)> = HashMap::new();
|
|
// Build language → type mapping
|
|
let mut lang_type_map: Vec<(String, String)> = Vec::new();
|
|
// Build language → group mapping (for resolving group names)
|
|
let mut lang_group_map: HashMap<String, String> = HashMap::new();
|
|
|
|
// Process languages in alphabetical order (deterministic)
|
|
let mut sorted_langs: Vec<_> = languages.iter().collect();
|
|
sorted_langs.sort_by(|a, b| a.0.cmp(b.0));
|
|
|
|
for (name, entry) in &sorted_langs {
|
|
let resolved_type = entry.lang_type.clone();
|
|
lang_type_map.push((name.to_string(), resolved_type.clone()));
|
|
|
|
if let Some(ref group) = entry.group {
|
|
lang_group_map.insert(name.to_string(), group.clone());
|
|
}
|
|
|
|
for (i, ext) in entry.extensions.iter().enumerate() {
|
|
let ext_lower = ext.to_lowercase();
|
|
if i == 0 {
|
|
// Primary extension - always prefer this
|
|
ext_primary
|
|
.entry(ext_lower)
|
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
|
} else {
|
|
// Secondary extension - only use if no primary claims it
|
|
ext_secondary
|
|
.entry(ext_lower)
|
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
|
}
|
|
}
|
|
|
|
for (i, fname) in entry.filenames.iter().enumerate() {
|
|
if i == 0 {
|
|
fname_primary
|
|
.entry(fname.clone())
|
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
|
} else {
|
|
fname_secondary
|
|
.entry(fname.clone())
|
|
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Merge: primary wins over secondary, with explicit priority for known conflicts
|
|
// These are common extensions where linguist has multiple primary claims
|
|
let priority_overrides: HashMap<&str, &str> = [
|
|
(".rs", "Rust"), // RenderScript also claims .rs
|
|
(".md", "Markdown"), // GCC Machine Description also claims .md
|
|
(".r", "R"), // Rebol also claims .r
|
|
(".s", "Assembly"), // Multiple assemblers claim .s
|
|
(".ms", "MAXScript"), // Unix Assembly also claims .ms
|
|
(".g", "G-code"), // GAP also claims .g
|
|
(".m", "Objective-C"), // Mercury, MUF, etc. also claim .m
|
|
(".w", "CWeb"), // OpenSCAD also claims .w
|
|
(".q", "Q"), // KBD also claims .q
|
|
].iter().cloned().collect();
|
|
|
|
for (ext, (lang, ltype)) in ext_primary {
|
|
if let Some(&preferred) = priority_overrides.get(ext.as_str()) {
|
|
// Only use this entry if it matches the preferred language
|
|
if lang == preferred {
|
|
ext_map.push((ext, lang, ltype));
|
|
}
|
|
// Otherwise skip - the preferred language's entry will be added when we process it
|
|
} else {
|
|
ext_map.push((ext, lang, ltype));
|
|
}
|
|
}
|
|
// Add preferred languages for any overrides that weren't added yet
|
|
for (&ext, &preferred) in &priority_overrides {
|
|
if !ext_map.iter().any(|(e, _, _)| e == ext) {
|
|
// Find the preferred language's entry
|
|
if let Some(entry) = languages.get(preferred)
|
|
&& entry.extensions.iter().any(|e| e.to_lowercase() == ext)
|
|
{
|
|
ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone()));
|
|
}
|
|
}
|
|
}
|
|
for (ext, (lang, ltype)) in ext_secondary {
|
|
if !ext_map.iter().any(|(e, _, _)| e == &ext) {
|
|
ext_map.push((ext, lang, ltype));
|
|
}
|
|
}
|
|
|
|
for (fname, (lang, ltype)) in fname_primary {
|
|
fname_map.push((fname, lang, ltype));
|
|
}
|
|
for (fname, (lang, ltype)) in fname_secondary {
|
|
if !fname_map.iter().any(|(f, _, _)| f == &fname) {
|
|
fname_map.push((fname, lang, ltype));
|
|
}
|
|
}
|
|
|
|
// Sort for deterministic output
|
|
ext_map.sort_by(|a, b| a.0.cmp(&b.0));
|
|
fname_map.sort_by(|a, b| a.0.cmp(&b.0));
|
|
lang_type_map.sort_by(|a, b| a.0.cmp(&b.0));
|
|
|
|
let mut code = String::with_capacity(512 * 1024);
|
|
|
|
// Extension → (language_name, lang_type) mapping
|
|
code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n");
|
|
code.push_str("/// Extension to (language_name, type) mapping.\n");
|
|
code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n");
|
|
code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n");
|
|
for (ext, lang, ltype) in &ext_map {
|
|
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
|
escape_str(ext), escape_str(lang), escape_str(ltype)));
|
|
}
|
|
code.push_str("];\n\n");
|
|
|
|
// Filename → (language_name, lang_type) mapping
|
|
code.push_str("/// Filename to (language_name, type) mapping.\n");
|
|
code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n");
|
|
code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n");
|
|
for (fname, lang, ltype) in &fname_map {
|
|
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
|
|
escape_str(fname), escape_str(lang), escape_str(ltype)));
|
|
}
|
|
code.push_str("];\n\n");
|
|
|
|
// Language name → type mapping
|
|
code.push_str("/// Language name to type mapping.\n");
|
|
code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n");
|
|
for (lang, ltype) in &lang_type_map {
|
|
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
|
escape_str(lang), escape_str(ltype)));
|
|
}
|
|
code.push_str("];\n\n");
|
|
|
|
// Language name → group mapping
|
|
code.push_str("/// Language name to parent group mapping.\n");
|
|
code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n");
|
|
let mut group_vec: Vec<_> = lang_group_map.iter().collect();
|
|
group_vec.sort_by(|a, b| a.0.cmp(b.0));
|
|
for (lang, group) in group_vec {
|
|
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
|
|
escape_str(lang), escape_str(group)));
|
|
}
|
|
code.push_str("];\n\n");
|
|
|
|
// Binary extension classification
|
|
code.push_str("/// Binary media type classification for extensions.\n");
|
|
code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n");
|
|
code.push_str(" match ext {\n");
|
|
|
|
// Image extensions
|
|
let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
|
|
".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif",
|
|
".apng", ".jfif", ".pjpeg", ".pjp"];
|
|
for ext in &image_exts {
|
|
code.push_str(&format!(" \"{}\" => \"Image\",\n", ext));
|
|
}
|
|
|
|
// Video extensions
|
|
let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
|
|
".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"];
|
|
for ext in &video_exts {
|
|
code.push_str(&format!(" \"{}\" => \"Video\",\n", ext));
|
|
}
|
|
|
|
// Audio extensions
|
|
let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
|
|
".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"];
|
|
for ext in &audio_exts {
|
|
code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext));
|
|
}
|
|
|
|
// Font extensions
|
|
let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"];
|
|
for ext in &font_exts {
|
|
code.push_str(&format!(" \"{}\" => \"Font\",\n", ext));
|
|
}
|
|
|
|
// Other binary
|
|
let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o",
|
|
".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo",
|
|
".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz",
|
|
".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls",
|
|
".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"];
|
|
for ext in &binary_exts {
|
|
code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext));
|
|
}
|
|
|
|
code.push_str(" _ => \"Binary\",\n");
|
|
code.push_str(" }\n");
|
|
code.push_str("}\n");
|
|
|
|
fs::write(out_dir.join("linguist_generated.rs"), code)?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Escapes `s` for embedding between double quotes in generated Rust source.
///
/// Every backslash and every double quote is prefixed with a backslash; all other
/// characters are copied through unchanged.
fn escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch == '\\' || ch == '"' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
|
|
|
|
/// Lists the entries directly inside `proto_dir` whose extension is `proto`, sorted by path.
///
/// # Errors
///
/// Returns an error if the directory cannot be read or if any directory entry fails to load.
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
    let mut found: Vec<PathBuf> = Vec::new();

    for dir_entry in fs::read_dir(proto_dir)? {
        let candidate = dir_entry?.path();
        let is_proto = candidate.extension().map_or(false, |ext| ext == "proto");
        if is_proto {
            found.push(candidate);
        }
    }

    // Directory iteration order is platform-dependent; sort for a stable result.
    found.sort();
    Ok(found)
}
|
|
|
|
/// Deletes every `.rs` entry directly inside `out_dir`, except one named `mod.rs`.
///
/// # Errors
///
/// Returns an error if the directory cannot be read, a directory entry fails to load, or a
/// matching entry cannot be removed.
fn clean_generated_files(out_dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    for dir_entry in fs::read_dir(out_dir)? {
        let candidate = dir_entry?.path();

        let has_rs_extension = candidate.extension().map_or(false, |ext| ext == "rs");
        // A path without a file name is never removed, same as `mod.rs` itself.
        let is_kept = candidate.file_name().map_or(true, |name| name == "mod.rs");
        if !has_rs_extension || is_kept {
            continue;
        }

        fs::remove_file(candidate)?;
    }

    Ok(())
}
|