files
I'm working on a sizeable collection of texts (around 1 million words) spread across a number of GitHub repos. Dealing with them manually would be a chore, so I wrote a couple of automation scripts.
Remove BOM
MemoQ has a habit of exporting UTF-8 plaintext/markdown with a BOM, which is not caught by our course-building script and messes up the final HTML.
import os


def has_bom(filepath):
    with open(filepath, "rb") as file:
        bom_check = file.read(3)
    return bom_check == b"\xEF\xBB\xBF"


def convert_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


def drop_bom(filepath):
    with open(filepath, "r", encoding="utf-8-sig") as file:
        content = file.read()
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"saved {filepath} without BOM")


# directory containing the files to convert
directory = "../.."
convert_files(directory)
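Since the problematic exports are plaintext or markdown, the walk can also be narrowed down to text-like files instead of touching everything in the tree. A minimal sketch, reusing has_bom and drop_bom from the script above (the extension list is my assumption, not something the original script enforces):

import os

# extensions to check; adjust to whatever the exports actually use
TEXT_EXTENSIONS = (".md", ".txt")


def convert_text_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if not filename.lower().endswith(TEXT_EXTENSIONS):
                continue  # skip binaries and anything else
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


# convert_text_files("../..")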
Replace local links with absolute URLs in markdown
This is useful when pasting a bunch of markdown files into some shared space like Google Drive (provided a hosted version already exists somewhere). It needs a CSV file matching file names to page names, and it handles anchors (or anchor-only links) based on pre-defined logic (recently adapted for Confluence).
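For example, assuming a hypothetical CSV row like intro.md,Getting Started (under the md_file,page_name header), a link such as [see intro](intro.md#first-steps) is rewritten to [see intro](https://domain-name.com/subpage/Getting+Started#GettingStarted-Firststeps): spaces in the page name become +, and the anchor gets the Confluence-style PageName-Sentencecase suffix.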
import os
import re
import csv
from pathlib import Path

# domain name
DOMAIN = "https://domain-name.com/subpage/"


# Read CSV file, map .md filenames to page names (md_file,page_name)
def load_mappings(csv_file):
    mappings = {}
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            md_file = row['md_file'].strip().replace(".md", "")
            page_name = row['page_name'].strip()
            mappings[md_file] = page_name
    return mappings


# Format the anchor text according to the rules
def format_anchor(anchor):
    # return ''.join(word.capitalize() for word in anchor.split('-'))  # CamelCase
    return anchor.replace('-', '').capitalize()  # Sentence case


# find md links
MD_LINK_PATTERN = re.compile(r'(\[.*?\])\(([^\)]+)\)')


# Replace links in md files
def replace_links_in_file(md_file, mappings):
    with open(md_file, 'r', encoding='utf-8') as f:
        content = f.read()

    def replace_match(match):
        text, link = match.groups()
        if link.startswith('#'):
            # anchor-only link: match the current md file's name against the csv
            file_name = Path(md_file).stem
            parts = ["", link[1:]]
        else:
            # get filename and anchor from the link
            parts = link.split('#')
            file_name = Path(parts[0]).stem
        anchor = parts[1] if len(parts) > 1 else None
        if file_name in mappings:
            page_name = mappings[file_name].replace(' ', '+')
            new_link = DOMAIN + page_name
            if anchor:
                new_link += f"#{page_name.replace('+','')}-{format_anchor(anchor)}"
            return f"{text}({new_link})"
        return match.group(0)  # unchanged if no match

    updated_content = MD_LINK_PATTERN.sub(replace_match, content)
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(updated_content)


# process md files recursively
def process_markdown_files(directory, csv_file):
    mappings = load_mappings(csv_file)
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):
                replace_links_in_file(os.path.join(root, file), mappings)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Replace Markdown links with absolute references")
    parser.add_argument("directory", help="Directory with markdown files")
    parser.add_argument("csv_file", help="CSV with md_file,page_name")
    args = parser.parse_args()
    process_markdown_files(args.directory, args.csv_file)
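Run it as, for instance, python replace_links.py ./content pages.csv (the script and file names here are just placeholders); it rewrites the markdown files in place, so it's best tried on a clean git checkout.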
Update yml (but like it's text)
Why update .yml files but read them as plaintext? It turns out that in my case they don't follow consistent rules when it comes to quoting, so instead of trying to figure out the reasons and/or enforce one style, I decided to treat them like text files. This also works well because the changes I'm automating in these yamls are minimal: one line (or two) in each file contains some PL text, and I need to add an extra line with the EN key and value based on the existing PL before I can start editing the file manually or deserializing it.
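For example (the term pair is invented for illustration and would come from the glossary CSV described below), a file containing

 pl: 'Wprowadzenie'

ends up with an extra EN line right under it:

 pl: 'Wprowadzenie'
 en: 'Introduction'

with the PL term swapped for its EN counterpart.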
import csv
import os


def get_glossary(csv_file):
    # Read the CSV file with the glossary (old_name,new_name per row)
    csv_mappings = []
    with open(csv_file, encoding="utf-8") as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if len(row) == 2:
                old_name, new_name = row
                csv_mappings.append((old_name, new_name))
    # longest terms first, so a longer match wins over its substrings
    terms_desc = sorted(csv_mappings, key=lambda x: len(x[0]), reverse=True)
    return terms_desc


def write_settings(root_dir, terms_desc):
    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            if filename == "settings.yml":  # find the files to edit
                file_path = os.path.join(root, filename)
                with open(file_path, encoding="utf-8") as f:
                    content = f.readlines()
                new_content = []
                for line in content:
                    if "pl:" in line:  # only touch lines with the pl key
                        line_pl = line
                        for pl_term, en_term in terms_desc:
                            if pl_term in line_pl:
                                line_en = line_pl.replace("pl:", "en:")
                                line_en = line_en.replace(pl_term, en_term)
                                line = f"{line_pl.rstrip()}\n{line_en}"
                                break
                    new_content.append(line.rstrip())
                with open(file_path, "w", encoding="utf-8") as f:
                    # save the same file with the added en lines
                    f.write("\n".join(new_content))
                    f.write("\n")


GLOSSARY = "../path/to/glossary.file"
ROOT_DIR = "../path/to/search"

terms_desc = get_glossary(GLOSSARY)
write_settings(ROOT_DIR, terms_desc)
Of course, to make it work I need a glossary. I collect the candidate terms with another script.
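The collector assumes settings.yml files shaped roughly like the sample below (the structure is reconstructed from how the script reads the data; the values themselves are made-up examples). A quick way to check that assumption against a real file is to load it with yaml.safe_load:

import yaml

# assumed shape of a settings.yml, reconstructed from how the collector reads it;
# the PL values are hypothetical examples
sample = yaml.safe_load("""
grading:
  - criterion:
      pl: Poprawność terminologii
translations:
  pl: 01_wprowadzenie
""")

print(sample["grading"][0]["criterion"]["pl"])  # Poprawność terminologii
print(sample["translations"]["pl"])  # 01_wprowadzenie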
import os
import re

import yaml


def read_settings(ROOT_DIR):
    grading_criteria = set()
    translations_pl = set()
    for root, dirs, files in os.walk(ROOT_DIR):
        for filename in files:
            if filename == "settings.yml":
                file_path = os.path.join(root, filename)
                print(file_path)
                with open(file_path, encoding="utf-8") as yaml_file:
                    data = yaml.safe_load(yaml_file)
                print(data)
                try:  # there's an odd irregular file in the repo
                    for entry in data["grading"]:
                        grading_criteria.add(entry["criterion"]["pl"])
                except Exception as e:
                    print(f"{e} not found")
                # the values start with numbers that don't need changing
                translation_pattern = r"(\d+_)?(.+)"
                try:
                    pl_value = data["translations"]["pl"]
                    translation_match = re.match(translation_pattern, pl_value)
                    if translation_match:
                        extracted_text = translation_match.group(2)
                    else:
                        extracted_text = pl_value
                    translations_pl.add(extracted_text)
                except Exception as e:
                    print(f"{e} not found")
                    # if there is no pl value, by default it is based on the name of the parent dir
                    parent_dir = os.path.basename(os.path.dirname(file_path))
                    translations_pl.add(parent_dir)
                    with open(file_path, encoding="utf-8") as f:
                        old_content = f.read()
                    new_content = (
                        old_content.strip() + f"\ntranslations:\n pl: '{parent_dir}'\n"
                    )
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(new_content)
    return grading_criteria, translations_pl


ROOT_DIR = r"../path/to/search"
grading_criteria, translations_pl = read_settings(ROOT_DIR)

if grading_criteria:
    grading_criteria_list = sorted(
        grading_criteria, key=len
    )  # or reverse=True to get better Translation Memory hits
    with open("settings_grading_criteria.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(grading_criteria_list))

translations_pl_list = sorted(translations_pl, key=len)
with open("settings_translations_pl.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(translations_pl_list))
The glossary used here is a .csv, usually compiled from two files (the source and the target of a translation).
import csv

# Define the paths to your input text files
file1_path = "../path/to/pl.file"
file2_path = "../path/to/en.file"

# Define the path to the output CSV file
output_csv_path = "../path/to/pl-en.csv"

# Read the contents of the input text files
with open(file1_path, encoding="utf-8") as file1, open(
    file2_path, encoding="utf-8"
) as file2:
    file1_lines = file1.readlines()
    file2_lines = file2.readlines()

# Combine the lines from both files into a list of tuples
combined_lines = [
    (line1.strip(), line2.strip()) for line1, line2 in zip(file1_lines, file2_lines)
]

# Write the combined lines to a CSV file
with open(output_csv_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    header = ["pl", "en"]
    csv_writer.writerow(header)
    for row in combined_lines:
        csv_writer.writerow(row)

print(
    f"Combined {len(combined_lines)} rows from {file1_path} and {file2_path} into {output_csv_path}"
)
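The two input files are expected to be line-aligned (one term per line, in the same order); note that zip() stops at the shorter file, so any extra lines at the end of either input are silently dropped. The resulting CSV then looks something like this (rows invented for illustration):

pl,en
Wprowadzenie,Introduction
Zadanie,Task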