Files
gitks/build.rs
T
zhenyi 939931acad feat(repository): add language statistics analysis feature
- Remove data directory from gitignore to include language data
- Add build script to parse linguist languages.yml and generate static mappings
- Include serde and serde_yml dependencies for YAML parsing
- Add lang_stats module with language detection and statistics calculation
- Generate protobuf definitions for language statistics API endpoints
- Implement GetLanguageStats RPC endpoint in repository server
- Add comprehensive test suite for language statistics functionality
- Include extension and filename based language detection logic
- Implement binary file classification and group resolution features
2026-06-10 13:06:59 +08:00

289 lines
11 KiB
Rust

use serde::Deserialize;
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
/// Build-script entry point.
///
/// Clears stale generated `.rs` files from `OUT_DIR`, compiles every `.proto`
/// file found in `<manifest>/proto` into client and server code, and then
/// generates the linguist lookup tables from `<manifest>/data/languages.yml`.
///
/// # Errors
///
/// Fails if a required Cargo environment variable is missing, or if any of
/// the filesystem, protobuf compilation, or table generation steps fail.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let crate_root = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
    let proto_root = crate_root.join("proto");
    let linguist_yml = crate_root.join("data").join("languages.yml");
    let generated_dir = PathBuf::from(std::env::var("OUT_DIR")?);

    fs::create_dir_all(&generated_dir)?;
    clean_generated_files(&generated_dir)?;

    // Proto compilation. The rerun directives are emitted by hand here, so
    // the builder's own emission is switched off below.
    let proto_sources = proto_files(&proto_root)?;
    proto_sources
        .iter()
        .for_each(|source| println!("cargo:rerun-if-changed={}", source.display()));
    println!("cargo:rerun-if-changed={}", proto_root.display());
    println!("cargo:rerun-if-changed=build.rs");

    tonic_prost_build::configure()
        .build_client(true)
        .build_server(true)
        .emit_rerun_if_changed(false)
        .out_dir(&generated_dir)
        .compile_protos(&proto_sources, &[proto_root])?;

    // Linguist language statistics tables.
    println!("cargo:rerun-if-changed={}", linguist_yml.display());
    generate_linguist(&linguist_yml, &generated_dir)?;

    Ok(())
}
/// One language record deserialized from `languages.yml`.
///
/// Only the keys consumed by `generate_linguist` are declared here. Every
/// key except `type` is optional in the input and falls back to an empty
/// value when absent.
#[derive(Deserialize)]
struct LanguageEntry {
/// Value of the YAML `type` key (`type` is a Rust keyword, hence the rename).
#[serde(rename = "type")]
lang_type: String,
/// Extensions claimed by the language. `generate_linguist` treats the first
/// entry as the language's primary extension and lowercases every entry.
#[serde(default)]
extensions: Vec<String>,
/// Exact filenames claimed by the language. As with extensions, the first
/// entry is treated as the primary claim.
#[serde(default)]
filenames: Vec<String>,
/// Name of the parent group this language belongs to, if any.
#[serde(default)]
group: Option<String>,
}
fn generate_linguist(
languages_yml: &Path,
out_dir: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
let yaml_str = fs::read_to_string(languages_yml)?;
let languages: HashMap<String, LanguageEntry> = serde_yml::from_str(&yaml_str)?;
// Build extension → (language, type) mapping
// Track primary extensions (first listed for each language) for conflict resolution
let mut ext_map: Vec<(String, String, String)> = Vec::new();
let mut ext_primary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if primary
let mut ext_secondary: HashMap<String, (String, String)> = HashMap::new(); // ext -> (lang, ltype) if secondary
// Build filename → (language, type) mapping
let mut fname_map: Vec<(String, String, String)> = Vec::new();
let mut fname_primary: HashMap<String, (String, String)> = HashMap::new();
let mut fname_secondary: HashMap<String, (String, String)> = HashMap::new();
// Build language → type mapping
let mut lang_type_map: Vec<(String, String)> = Vec::new();
// Build language → group mapping (for resolving group names)
let mut lang_group_map: HashMap<String, String> = HashMap::new();
// Process languages in alphabetical order (deterministic)
let mut sorted_langs: Vec<_> = languages.iter().collect();
sorted_langs.sort_by(|a, b| a.0.cmp(b.0));
for (name, entry) in &sorted_langs {
let resolved_type = entry.lang_type.clone();
lang_type_map.push((name.to_string(), resolved_type.clone()));
if let Some(ref group) = entry.group {
lang_group_map.insert(name.to_string(), group.clone());
}
for (i, ext) in entry.extensions.iter().enumerate() {
let ext_lower = ext.to_lowercase();
if i == 0 {
// Primary extension - always prefer this
ext_primary
.entry(ext_lower)
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
} else {
// Secondary extension - only use if no primary claims it
ext_secondary
.entry(ext_lower)
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
}
}
for (i, fname) in entry.filenames.iter().enumerate() {
if i == 0 {
fname_primary
.entry(fname.clone())
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
} else {
fname_secondary
.entry(fname.clone())
.or_insert_with(|| (name.to_string(), resolved_type.clone()));
}
}
}
// Merge: primary wins over secondary, with explicit priority for known conflicts
// These are common extensions where linguist has multiple primary claims
let priority_overrides: HashMap<&str, &str> = [
(".rs", "Rust"), // RenderScript also claims .rs
(".md", "Markdown"), // GCC Machine Description also claims .md
(".r", "R"), // Rebol also claims .r
(".s", "Assembly"), // Multiple assemblers claim .s
(".ms", "MAXScript"), // Unix Assembly also claims .ms
(".g", "G-code"), // GAP also claims .g
(".m", "Objective-C"), // Mercury, MUF, etc. also claim .m
(".w", "CWeb"), // OpenSCAD also claims .w
(".q", "Q"), // KBD also claims .q
].iter().cloned().collect();
for (ext, (lang, ltype)) in ext_primary {
if let Some(&preferred) = priority_overrides.get(ext.as_str()) {
// Only use this entry if it matches the preferred language
if lang == preferred {
ext_map.push((ext, lang, ltype));
}
// Otherwise skip - the preferred language's entry will be added when we process it
} else {
ext_map.push((ext, lang, ltype));
}
}
// Add preferred languages for any overrides that weren't added yet
for (&ext, &preferred) in &priority_overrides {
if !ext_map.iter().any(|(e, _, _)| e == ext) {
// Find the preferred language's entry
if let Some(entry) = languages.get(preferred)
&& entry.extensions.iter().any(|e| e.to_lowercase() == ext)
{
ext_map.push((ext.to_string(), preferred.to_string(), entry.lang_type.clone()));
}
}
}
for (ext, (lang, ltype)) in ext_secondary {
if !ext_map.iter().any(|(e, _, _)| e == &ext) {
ext_map.push((ext, lang, ltype));
}
}
for (fname, (lang, ltype)) in fname_primary {
fname_map.push((fname, lang, ltype));
}
for (fname, (lang, ltype)) in fname_secondary {
if !fname_map.iter().any(|(f, _, _)| f == &fname) {
fname_map.push((fname, lang, ltype));
}
}
// Sort for deterministic output
ext_map.sort_by(|a, b| a.0.cmp(&b.0));
fname_map.sort_by(|a, b| a.0.cmp(&b.0));
lang_type_map.sort_by(|a, b| a.0.cmp(&b.0));
let mut code = String::with_capacity(512 * 1024);
// Extension → (language_name, lang_type) mapping
code.push_str("// Auto-generated from linguist languages.yml — do not edit manually.\n\n");
code.push_str("/// Extension to (language_name, type) mapping.\n");
code.push_str("/// Key is lowercase extension including the dot, e.g. \".rs\".\n");
code.push_str("pub static EXTENSION_MAP: &[(&str, &str, &str)] = &[\n");
for (ext, lang, ltype) in &ext_map {
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
escape_str(ext), escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Filename → (language_name, lang_type) mapping
code.push_str("/// Filename to (language_name, type) mapping.\n");
code.push_str("/// Key is exact filename, e.g. \"Makefile\", \"Dockerfile\".\n");
code.push_str("pub static FILENAME_MAP: &[(&str, &str, &str)] = &[\n");
for (fname, lang, ltype) in &fname_map {
code.push_str(&format!(" (\"{}\", \"{}\", \"{}\"),\n",
escape_str(fname), escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Language name → type mapping
code.push_str("/// Language name to type mapping.\n");
code.push_str("pub static LANG_TYPE_MAP: &[(&str, &str)] = &[\n");
for (lang, ltype) in &lang_type_map {
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
escape_str(lang), escape_str(ltype)));
}
code.push_str("];\n\n");
// Language name → group mapping
code.push_str("/// Language name to parent group mapping.\n");
code.push_str("pub static LANG_GROUP_MAP: &[(&str, &str)] = &[\n");
let mut group_vec: Vec<_> = lang_group_map.iter().collect();
group_vec.sort_by(|a, b| a.0.cmp(b.0));
for (lang, group) in group_vec {
code.push_str(&format!(" (\"{}\", \"{}\"),\n",
escape_str(lang), escape_str(group)));
}
code.push_str("];\n\n");
// Binary extension classification
code.push_str("/// Binary media type classification for extensions.\n");
code.push_str("pub fn classify_binary_extension(ext: &str) -> &'static str {\n");
code.push_str(" match ext {\n");
// Image extensions
let image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
".webp", ".tiff", ".tif", ".psd", ".raw", ".heic", ".heif", ".avif",
".apng", ".jfif", ".pjpeg", ".pjp"];
for ext in &image_exts {
code.push_str(&format!(" \"{}\" => \"Image\",\n", ext));
}
// Video extensions
let video_exts = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
".m4v", ".mpg", ".mpeg", ".3gp", ".3g2", ".ogv", ".vob"];
for ext in &video_exts {
code.push_str(&format!(" \"{}\" => \"Video\",\n", ext));
}
// Audio extensions
let audio_exts = [".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
".opus", ".aiff", ".ape", ".alac", ".mid", ".midi"];
for ext in &audio_exts {
code.push_str(&format!(" \"{}\" => \"Audio\",\n", ext));
}
// Font extensions
let font_exts = [".ttf", ".otf", ".woff", ".woff2", ".eot"];
for ext in &font_exts {
code.push_str(&format!(" \"{}\" => \"Font\",\n", ext));
}
// Other binary
let binary_exts = [".exe", ".dll", ".so", ".dylib", ".a", ".lib", ".o",
".obj", ".bin", ".dat", ".db", ".sqlite", ".sqlite3", ".pyc", ".pyo",
".class", ".jar", ".war", ".ear", ".zip", ".tar", ".gz",
".bz2", ".xz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls",
".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".wasm", ".node"];
for ext in &binary_exts {
code.push_str(&format!(" \"{}\" => \"Binary\",\n", ext));
}
code.push_str(" _ => \"Binary\",\n");
code.push_str(" }\n");
code.push_str("}\n");
fs::write(out_dir.join("linguist_generated.rs"), code)?;
Ok(())
}
/// Escapes `s` so it can be embedded between double quotes in generated Rust
/// source: every backslash and every double quote gets a leading backslash.
fn escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch == '\\' || ch == '"' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
/// Lists the entries directly inside `proto_dir` whose extension is `proto`,
/// sorted by path.
///
/// # Errors
///
/// Returns an error if the directory cannot be read or if any directory
/// entry fails to load.
fn proto_files(proto_dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
    let mut protos = Vec::new();
    for dir_entry in fs::read_dir(proto_dir)? {
        let candidate = dir_entry?.path();
        if candidate.extension().is_some_and(|ext| ext == "proto") {
            protos.push(candidate);
        }
    }
    protos.sort();
    Ok(protos)
}
/// Deletes every `.rs` entry directly inside `out_dir` except `mod.rs`, so
/// outputs left over from earlier builds do not linger.
///
/// # Errors
///
/// Returns an error if the directory cannot be read, an entry fails to load,
/// or a matching file cannot be removed.
fn clean_generated_files(out_dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    for dir_entry in fs::read_dir(out_dir)? {
        let candidate = dir_entry?.path();
        let has_rs_extension = candidate.extension().is_some_and(|ext| ext == "rs");
        if !has_rs_extension {
            continue;
        }
        match candidate.file_name() {
            Some(name) if name != "mod.rs" => fs::remove_file(&candidate)?,
            _ => {}
        }
    }
    Ok(())
}