# aurak/clean_translations.py


import sys
import re

# Keys every language block must define. A block lacking one receives the key
# name itself as a placeholder value.
_REQUIRED_KEYS = (
    "kbSettingsSaved", "failedToSaveSettings", "actionFailed", "userAddedToOrganization",
    "featureUpdated", "roleTenantAdmin", "roleRegularUser", "creatingRegularUser",
    "editUserRole", "targetRole", "editCategory", "totalTenants", "systemUsers",
    "systemHealth", "operational", "orgManagement", "globalTenantControl",
    "newTenant", "domainOptional", "saveChanges", "modelConfiguration",
    "defaultLLMModel", "selectLLM", "selectEmbedding", "rerankModel", "none",
    "indexingChunkingConfig", "chatHyperparameters", "temperature", "precise",
    "creative", "maxResponseTokens", "retrievalSearchSettings", "topK",
    "similarityThreshold", "enableHybridSearch", "hybridSearchDesc", "hybridWeight",
    "pureText", "pureVector", "enableQueryExpansion", "queryExpansionDesc",
    "enableHyDE", "hydeDesc", "enableReranking", "rerankingDesc", "broad",
    "strict", "maxInput", "dimensions", "defaultBadge", "dims", "ctx",
    "baseApi", "configured", "groupUpdated", "groupDeleted", "groupCreated",
    "navCatalog", "allDocuments", "categories", "uncategorizedFiles", "category",
    "statusReadyDesc", "statusIndexingDesc", "selectCategory", "noneUncategorized",
    "previous", "next", "createCategory", "categoryDesc", "categoryName",
    "createCategoryBtn", "newGroup", "noKnowledgeGroups", "createGroupDesc",
    "noDescriptionProvided", "browseManageFiles", "filterGroupFiles",
)

# Start of a language block, e.g. "\n  zh: {". The capture group makes
# re.split() keep each header in its result list.
_BLOCK_HEADER_RE = re.compile(r'(\s+\w+: \{)')

# An indented "key:" at the start of a line.
_KEY_LINE_RE = re.compile(r'^\s+([a-zA-Z0-9_-]+):')

# An indented line holding only "}" or "},".
_INDENTED_CLOSER_RE = re.compile(r'^\s+},?$')


def _find_block_end(lines, indent):
    """Return the index of the line closing the block, or None if not found."""
    # Preferred: the first "}" / "}," line indented exactly like the header.
    # This keeps whatever follows the last block (e.g. "};" and export
    # statements) out of the block body.
    closer_re = re.compile('^' + re.escape(indent) + r'\},?\s*$')
    for index, line in enumerate(lines):
        if closer_re.match(line):
            return index
    # Fallback for irregular indentation: accept an indented closing brace
    # only when it is the last non-blank line of the chunk.
    for index in range(len(lines) - 1, -1, -1):
        if lines[index].strip():
            return index if _INDENTED_CLOSER_RE.match(lines[index]) else None
    return None


def _clean_block(block_header, block_content, required_keys):
    """Return header + content with duplicate keys dropped and missing keys added."""
    lines = block_content.split('\n')

    # Indentation of the header line itself (text after the last newline).
    header_line = block_header.rpartition('\n')[2]
    indent = header_line[:len(header_line) - len(header_line.lstrip())]

    end = _find_block_end(lines, indent)
    body = lines if end is None else lines[:end]
    tail = [] if end is None else lines[end:]

    # Keep the first occurrence of every key; later duplicates are dropped.
    # Only the line carrying the key is removed, so entries are assumed to
    # fit on a single line.
    keys_seen = set()
    kept = []
    for line in body:
        match = _KEY_LINE_RE.match(line)
        if match:
            key = match.group(1)
            if key in keys_seen:
                continue
            keys_seen.add(key)
        kept.append(line)

    # Without a recognizable closing brace there is no safe insertion point,
    # so the block is only de-duplicated.
    if end is not None:
        additions = []
        for key in required_keys:
            if key not in keys_seen:
                keys_seen.add(key)
                additions.append(f'    {key}: "{key}",')
        if additions:
            # Insert after the last non-blank body line so blank lines before
            # the closing brace stay where they were.
            insert_at = len(kept)
            for index in range(len(kept) - 1, -1, -1):
                if kept[index].strip():
                    insert_at = index + 1
                    # The final entry may legally omit its trailing comma;
                    # add one so the appended entries remain valid syntax.
                    if (_KEY_LINE_RE.match(kept[index])
                            and not kept[index].rstrip().endswith(',')):
                        kept[index] = kept[index].rstrip() + ','
                    break
            kept[insert_at:insert_at] = additions

    return block_header + '\n'.join(kept + tail)


def clean_translations(file_path, missing_keys=None):
    """Clean a translations file in place.

    The file is split at every ``<name>: {`` header (e.g. ``zh: {``). Within
    each resulting block, repeated keys are reduced to their first occurrence
    and every required key that is absent is appended just before the block's
    closing brace, using the key name as a placeholder value. Text before the
    first header and after a block's closing brace is written back unchanged.

    Blocks are treated as flat: a nested ``name: {`` object is split off as a
    block of its own.

    Args:
        file_path: Path of the UTF-8 translations file to rewrite.
        missing_keys: Optional iterable of key names every block must contain.
            Defaults to the built-in list of required keys.

    Raises:
        OSError: If the file cannot be read or written.
    """
    required_keys = _REQUIRED_KEYS if missing_keys is None else tuple(missing_keys)

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # With one capture group, re.split() yields
    # [preamble, header, content, header, content, ...].
    blocks = _BLOCK_HEADER_RE.split(content)
    pieces = [blocks[0]]
    for block_header, block_content in zip(blocks[1::2], blocks[2::2]):
        pieces.append(_clean_block(block_header, block_content, required_keys))

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(''.join(pieces))

if __name__ == "__main__":
    # The script needs the path of the translations file to rewrite; exit
    # with a usage message instead of an IndexError traceback when it is
    # missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <translations-file>")
    clean_translations(sys.argv[1])