feat(repository): add language statistics analysis feature

- Remove data directory from gitignore to include language data
- Add build script to parse linguist languages.yml and generate static mappings
- Include serde and serde_yml dependencies for YAML parsing
- Add lang_stats module with language detection and statistics calculation
- Generate protobuf definitions for language statistics API endpoints
- Implement GetLanguageStats RPC endpoint in repository server
- Add comprehensive test suite for language statistics functionality
- Include extension and filename based language detection logic
- Implement binary file classification and group resolution features
This commit is contained in:
zhenyi
2026-06-10 13:06:59 +08:00
parent 9a0c26e5f6
commit 939931acad
10 changed files with 10202 additions and 1 deletions
+25
View File
@@ -385,6 +385,30 @@ message GetRawChangesResponse {
}
// Request message for the GetLanguageStats RPC.
message GetLanguageStatsRequest {
  // Repository the request targets.
  RepositoryHeader repository = 1;

  // Revision to inspect; defaults to HEAD if unset.
  ObjectSelector revision = 2;

  // Optional: restrict the statistics to this subdirectory.
  string path = 3;

  // Skip files larger than this size, in bytes. 0 selects the 512KB default.
  uint32 max_file_size = 4;
}
// Response message for the GetLanguageStats RPC.
message GetLanguageStatsResponse {
  // Per-language entries; see LanguageStat for the shape of each one.
  repeated LanguageStat languages = 1;

  // Overall file count.
  // NOTE(review): confirm whether skipped (oversized/binary) files are
  // included in this total.
  uint64 total_files = 2;

  // Overall size in bytes.
  // NOTE(review): presumably the sum of LanguageStat.bytes; verify against
  // the server implementation.
  uint64 total_bytes = 3;

  // Overall line count.
  // NOTE(review): presumably the sum of LanguageStat.lines; verify against
  // the server implementation.
  uint64 total_lines = 4;
}
// Statistics for one language, as carried in
// GetLanguageStatsResponse.languages.
message LanguageStat {
  // Language name, e.g. "Rust".
  string language = 1;

  // Language type string: "programming", "markup", "data", "prose".
  string lang_type = 2;

  // File count for this language.
  uint64 file_count = 3;

  // Byte count for this language.
  uint64 bytes = 4;

  // Line count for this language.
  uint64 lines = 5;

  // Percentage by bytes.
  // NOTE(review): confirm the scale (0-100 vs. 0-1) and which total it is
  // relative to.
  double percentage = 6;
}
message FetchRemoteRequest {
RepositoryHeader repository = 1;
string remote_url = 2;
@@ -453,4 +477,5 @@ service RepositoryService {
rpc FindLicense(FindLicenseRequest) returns (FindLicenseResponse);
rpc OptimizeRepository(OptimizeRepositoryRequest) returns (OptimizeRepositoryResponse);
rpc GetRawChanges(GetRawChangesRequest) returns (GetRawChangesResponse);
// Returns language statistics for a repository. The request selects the
// repository (see GetLanguageStatsRequest); the response carries per-language
// entries and overall totals (see GetLanguageStatsResponse).
rpc GetLanguageStats(GetLanguageStatsRequest) returns (GetLanguageStatsResponse);
}