use serde::Deserialize; use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; fn main() -> Result<(), Box> { let manifest_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?); let proto_dir = manifest_dir.join("proto"); let data_dir = manifest_dir.join("data"); let out_dir = PathBuf::from(std::env::var("OUT_DIR")?); fs::create_dir_all(&out_dir)?; clean_generated_files(&out_dir)?; // Proto compilation let protos = proto_files(&proto_dir)?; for proto in &protos { println!("cargo:rerun-if-changed={}", proto.display()); } println!("cargo:rerun-if-changed={}", proto_dir.display()); println!("cargo:rerun-if-changed=build.rs"); tonic_prost_build::configure() .build_client(true) .build_server(true) .emit_rerun_if_changed(false) .out_dir(&out_dir) .compile_protos(&protos, &[proto_dir])?; // Linguist language stats generation let languages_yml = data_dir.join("languages.yml"); println!("cargo:rerun-if-changed={}", languages_yml.display()); generate_linguist(&languages_yml, &out_dir)?; Ok(()) } #[derive(Deserialize)] struct LanguageEntry { #[serde(rename = "type")] lang_type: String, #[serde(default)] extensions: Vec, #[serde(default)] filenames: Vec, #[serde(default)] group: Option, } fn generate_linguist( languages_yml: &Path, out_dir: &Path, ) -> Result<(), Box> { let yaml_str = fs::read_to_string(languages_yml)?; let languages: HashMap = serde_yml::from_str(&yaml_str)?; // Build extension → (language, type) mapping // Track primary extensions (first listed for each language) for conflict resolution let mut ext_map: Vec<(String, String, String)> = Vec::new(); let mut ext_primary: HashMap = HashMap::new(); // ext -> (lang, ltype) if primary let mut ext_secondary: HashMap = HashMap::new(); // ext -> (lang, ltype) if secondary // Build filename → (language, type) mapping let mut fname_map: Vec<(String, String, String)> = Vec::new(); let mut fname_primary: HashMap = HashMap::new(); let mut fname_secondary: HashMap = HashMap::new(); // Build language → type mapping let mut lang_type_map: Vec<(String, String)> = Vec::new(); // Build language → group mapping (for resolving group names) let mut lang_group_map: HashMap = HashMap::new(); // Process languages in alphabetical order (deterministic) let mut sorted_langs: Vec<_> = languages.iter().collect(); sorted_langs.sort_by(|a, b| a.0.cmp(b.0)); for (name, entry) in &sorted_langs { let resolved_type = entry.lang_type.clone(); lang_type_map.push((name.to_string(), resolved_type.clone())); if let Some(ref group) = entry.group { lang_group_map.insert(name.to_string(), group.clone()); } for (i, ext) in entry.extensions.iter().enumerate() { let ext_lower = ext.to_lowercase(); if i == 0 { // Primary extension - always prefer this ext_primary .entry(ext_lower) .or_insert_with(|| (name.to_string(), resolved_type.clone())); } else { // Secondary extension - only use if no primary claims it ext_secondary .entry(ext_lower) .or_insert_with(|| (name.to_string(), resolved_type.clone())); } } for (i, fname) in entry.filenames.iter().enumerate() { if i == 0 { fname_primary .entry(fname.clone()) .or_insert_with(|| (name.to_string(), resolved_type.clone())); } else { fname_secondary .entry(fname.clone()) .or_insert_with(|| (name.to_string(), resolved_type.clone())); } } } // Merge: primary wins over secondary, with explicit priority for known conflicts // These are common extensions where linguist has multiple primary claims let priority_overrides: HashMap<&str, &str> = [ (".rs", "Rust"), // RenderScript also claims .rs (".md", "Markdown"), // GCC Machine Description also claims .md (".r", "R"), // Rebol also claims .r (".s", "Assembly"), // Multiple assemblers claim .s (".ms", "MAXScript"), // Unix Assembly also claims .ms (".g", "G-code"), // GAP also claims .g (".m", "Objective-C"), // Mercury, MUF, etc. also claim .m (".w", "CWeb"), // OpenSCAD also claims .w (".q", "Q"), // KBD also claims .q ].iter().cloned().collect(); for (ext, (lang, ltype)) in ext_primary { if let Some(&preferred) = priority_overrides.get(ext.as_str()) { // Only use this entry if it matches the preferred language if lang == preferred { ext_map.push((ext, lang, ltype)); } // Otherwise skip - the preferred language's entry will be added when we process it } else { ext_map.push((ext, lang, ltype)); } } // Add preferred languages for any overrides that weren't added yet for (&ext, &preferred) in &priority_overrides { if !ext_map.iter().any(|(e, _, _)| e == ext) { // Find the preferred language's entry if let Some(entry) = languages.get(preferred) && entry.extensions.iter().any(|e| e.to_lowercase() == ext) { ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone())); } } } for (ext, (lang, ltype)) in ext_secondary { if !ext_map.iter().any(|(e, _, _)| e == &ext) { ext_map.push((ext, lang, ltype)); } } for (fname, (lang, ltype)) in fname_primary { fname_map.push((fname, lang, ltype)); } for (fname, (lang, ltype)) in fname_secondary { if !fname_map.iter().any(|(f, _, _)| f == &fname) { fname_map.push((fname, lang, ltype)); } } // Sort for deterministic output ext_map.sort_by(|a, b| a.0.cmp(&b.0)); fname_map.sort_by(|a, b| a.0.cmp(&b.0)); lang_type_map.sort_by(|a, b| a.0.cmp(&b.0)); let mut code = String::with_capacity(512 * 1024); // Extension → (language_name, lang_type) mapping code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n"); code.push_str("/// Extension to (language_name, type) mapping.\n"); code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n"); code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n"); for (ext, lang, ltype) in &ext_map { code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n", escape_str(ext), escape_str(lang), escape_str(ltype))); } code.push_str("];\n\n"); // Filename → (language_name, lang_type) mapping code.push_str("/// Filename to (language_name, type) mapping.\n"); code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n"); code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n"); for (fname, lang, ltype) in &fname_map { code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n", escape_str(fname), escape_str(lang), escape_str(ltype))); } code.push_str("];\n\n"); // Language name → type mapping code.push_str("/// Language name to type mapping.\n"); code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n"); for (lang, ltype) in &lang_type_map { code.push_str(&format!(" (\"{}\", \"{}\"),\n", escape_str(lang), escape_str(ltype))); } code.push_str("];\n\n"); // Language name → group mapping code.push_str("/// Language name to parent group mapping.\n"); code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n"); let mut group_vec: Vec<_> = lang_group_map.iter().collect(); group_vec.sort_by(|a, b| a.0.cmp(b.0)); for (lang, group) in group_vec { code.push_str(&format!(" (\"{}\", \"{}\"),\n", escape_str(lang), escape_str(group))); } code.push_str("];\n\n"); // Binary extension classification code.push_str("/// Binary media type classification for extensions.\n"); code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n"); code.push_str(" match ext {\n"); // Image extensions let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg", ".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif", ".apng", ".jfif", ".pjpeg", ".pjp"]; for ext in &image_exts { code.push_str(&format!(" \"{}\" => \"Image\",\n", ext)); } // Video extensions let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"]; for ext in &video_exts { code.push_str(&format!(" \"{}\" => \"Video\",\n", ext)); } // Audio extensions let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a", ".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"]; for ext in &audio_exts { code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext)); } // Font extensions let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"]; for ext in &font_exts { code.push_str(&format!(" \"{}\" => \"Font\",\n", ext)); } // Other binary let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o", ".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo", ".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"]; for ext in &binary_exts { code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext)); } code.push_str(" _ => \"Binary\",\n"); code.push_str(" }\n"); code.push_str("}\n"); fs::write(out_dir.join("linguist_generated.rs"), code)?; Ok(()) } fn escape_str(s: &str) -> String { s.replace('\\', "\\\\") .replace('"', "\\\"") } fn proto_files(proto_dir: &Path) -> Result, Box> { let mut files = fs::read_dir(proto_dir)? .map(|entry| entry.map(|entry| entry.path())) .collect::, _>>()?; files.retain(|path| path.extension().is_some_and(|ext| ext == "proto")); files.sort(); Ok(files) } fn clean_generated_files(out_dir: &Path) -> Result<(), Box> { for entry in fs::read_dir(out_dir)? { let path = entry?.path(); let is_generated_rs = path.extension().is_some_and(|ext| ext == "rs") && path.file_name().is_some_and(|name| name != "mod.rs"); if is_generated_rs { fs::remove_file(path)?; } } Ok(()) }