files

I'm working on a sizeable (around 1 million words) collection of texts spread across a number of GitHub repos. Dealing with them manually would be a chore, so I wrote a couple of automation scripts.

Remove BOM

memoQ has a habit of exporting UTF-8 plaintext/Markdown with a BOM, which our course-building script doesn't catch and which then messes up the final HTML.
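For reference, the BOM is just the UTF-8 encoding of U+FEFF, i.e. the three bytes EF BB BF. When a downstream tool mis-decodes them as cp1252 or Latin-1, they surface as the classic mojibake at the top of the page:

print(b"\xEF\xBB\xBF".decode("cp1252"))  # -> ï»¿

The cleanup script walks the tree and rewrites any file that starts with those bytes: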

import os


def has_bom(filepath):
    with open(filepath, "rb") as file:
        bom_check = file.read(3)
        return bom_check == b"\xEF\xBB\xBF"


def convert_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


def drop_bom(filepath):
    # the utf-8-sig codec strips the BOM on read
    with open(filepath, "r", encoding="utf-8-sig") as file:
        content = file.read()
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"saved {filepath} without BOM")


# directory containing the files to convert
directory = "../.."

convert_files(directory)
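A side note on the utf-8-sig codec: it also reads BOM-less files unchanged, so drop_bom would be safe to run on any UTF-8 file. The has_bom check mainly keeps the script from needlessly rewriting files that are already clean.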

update yml (but like it's text)

Why update .yml files but read them as plaintext? It turns out that in my case they don't follow consistent rules when it comes to quoting, so instead of trying to figure out the reasons and/or enforce one style, I decided to treat them as text files. This also works well because the changes I'm automating in these YAMLs are minimal: one line (or two) in each file contains some PL text, and I need to add an extra line with the EN key and value based on the existing PL line before I can start editing the file manually or deserialize it.
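To illustrate with a made-up entry (the pl-under-translations nesting matches what the collection script below reads), the script takes a line like

translations:
  pl: 'Wstęp do programowania'

and, using the glossary, appends the matching EN line right below it:

translations:
  pl: 'Wstęp do programowania'
  en: 'Intro to programming'

The script below does this for every settings.yml under the root: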

import csv
import os


def get_glossary(csv_path):
    # read the pl -> en term pairs from the glossary CSV
    csv_mappings = []
    with open(csv_path, encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            if len(row) == 2 and row != ["pl", "en"]:  # skip the header row
                old_name, new_name = row
                csv_mappings.append((old_name, new_name))
    # longest terms first, so a term is never pre-empted by a shorter term it contains
    terms_desc = sorted(csv_mappings, key=lambda x: len(x[0]), reverse=True)
    return terms_desc


def write_settings(root_dir, terms_desc):
    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            if filename == "settings.yml": # Find the files to edit
                file_path = os.path.join(root, filename)
                with open(file_path, encoding="utf-8") as f:
                    content = f.readlines()
                new_content = []
                for line in content:
                    if "pl:" in line: # only touch lines with the pl key
                        line_pl = line
                        for pl_term, en_term in terms_desc:
                            if pl_term in line_pl:
                                line_en = line_pl.replace("pl:", "en:")
                                line_en = line_en.replace(pl_term, en_term)
                                line = f"{line_pl.rstrip()}\n{line_en}"
                                break
                    new_content.append(line.rstrip())
                with open(file_path, "w", encoding="utf-8") as f:
                    # save the file back in place with the added en lines
                    f.write("\n".join(new_content))
                    f.write("\n")


GLOSSARY = "../path/to/glossary.file"
ROOT_DIR = "../path/to/search"
terms_desc = get_glossary(GLOSSARY)
write_settings(ROOT_DIR, terms_desc)

Of course, to make this work I need a glossary. I collect the candidate terms with another script.


import os
import re
import yaml


def read_settings(root_dir):
    grading_criteria = set()
    translations_pl = set()
    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            if filename == "settings.yml":
                file_path = os.path.join(root, filename)
                print(file_path)
                with open(file_path, encoding="utf-8") as yaml_file:
                    data = yaml.safe_load(yaml_file)
                try:  # there's an odd irregular file in the repo
                    for entry in data["grading"]:
                        grading_criteria.add(entry["criterion"]["pl"])
                except (KeyError, TypeError) as e:
                    print(f"{file_path}: no grading criteria ({e})")
                
                translation_pattern = r"(\d+_)?(.+)"  # values may start with a numeric prefix that shouldn't be translated
                try:
                    pl_value = data["translations"]["pl"]
                    translation_match = re.match(translation_pattern, pl_value)
                    if translation_match:
                        extracted_text = translation_match.group(2)
                    else:
                        extracted_text = pl_value
                    translations_pl.add(extracted_text)
                except (KeyError, TypeError) as e:
                    print(f"{file_path}: {e} missing in translations")
                    # if there is no pl value, by default it is based on the name of the parent dir
                    parent_dir = os.path.basename(os.path.dirname(file_path))
                    translations_pl.add(parent_dir)
                    with open(file_path, encoding="utf-8") as f:
                        old_content = f.read()
                    new_content = (
                        old_content.strip() + f"\ntranslations:\n  pl: '{parent_dir}'\n"
                    )
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(new_content)
    return grading_criteria, translations_pl


ROOT_DIR = r"../path/to/search"
grading_criteria, translations_pl = read_settings(ROOT_DIR)

if grading_criteria:
    # sorted shortest-first; use reverse=True instead to get better Translation Memory hits
    grading_criteria_list = sorted(grading_criteria, key=len)
    with open("settings_grading_criteria.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(grading_criteria_list))

translations_pl_list = sorted(translations_pl, key=len)
with open("settings_translations_pl.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(translations_pl_list))
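As a sanity check, here's the prefix-stripping regex on a couple of made-up values (the numeric prefix is optional, so plain names pass through unchanged):

import re

print(re.match(r"(\d+_)?(.+)", "03_Wstęp do programowania").group(2))  # Wstęp do programowania
print(re.match(r"(\d+_)?(.+)", "Zadania").group(2))  # Zadania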

The glossary used here is a .csv, usually compiled from two line-aligned files (the source and the target of the translation).

import csv

# Define the paths to your input text files
file1_path = "../path/to/pl.file"
file2_path = "../path/to/en.file"


# Define the path to the output CSV file
output_csv_path = "../path/to/pl-en.csv"

# Read the contents of the input text files
with open(file1_path, encoding="utf-8") as file1, open(
    file2_path, encoding="utf-8"
) as file2:
    file1_lines = file1.readlines()
    file2_lines = file2.readlines()

# Combine the lines from both files into (pl, en) pairs;
# zip stops silently at the shorter file, so the inputs must be line-aligned
combined_lines = [
    (line1.strip(), line2.strip()) for line1, line2 in zip(file1_lines, file2_lines)
]

# Write the combined lines to a CSV file
with open(output_csv_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    header = ["pl", "en"]
    csv_writer.writerow(header)
    for row in combined_lines:
        csv_writer.writerow(row)

print(
    f"Combined {len(combined_lines)} rows from {file1_path} and {file2_path} into {output_csv_path}"
)
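The result is a plain two-column file (values invented for illustration):

pl,en
Wstęp do programowania,Intro to programming
Zadania,Exercises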
