files

I'm working on a sizeable (around 1 million words) collection of texts spread across a number of GitHub repos. Dealing with them manually would be a chore, so I wrote a couple of automation scripts.

Remove BOM

MemoQ has a habit of exporting UTF-8 plaintext/markdown with a BOM, which is not caught by our course-building script and messes up the final HTML.

Click to see code...
import os


def has_bom(filepath):
    with open(filepath, "rb") as file:
        bom_check = file.read(3)
        return bom_check == b"\xEF\xBB\xBF"


def convert_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


def drop_bom(filepath):
    # re-read with utf-8-sig (which strips the BOM), then write back as plain utf-8
    with open(filepath, "r", encoding="utf-8-sig") as file:
        content = file.read()
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"saved {filepath} without BOM")


# directory containing the files to convert
directory = "../.."

convert_files(directory)
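
To sanity-check the behaviour, you can create a throwaway file with a BOM and run the two helpers on it (the test file name below is just for illustration):

with open("bom_test.md", "wb") as f:
    f.write(b"\xEF\xBB\xBF# Heading\n")

print(has_bom("bom_test.md"))  # True
drop_bom("bom_test.md")
print(has_bom("bom_test.md"))  # False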

Make md links absolute

This is useful when pasting a bunch of markdown files to a shared space like Google Drive (provided a hosted version already exists somewhere). It needs a CSV file mapping file names to page names. It handles anchors (and anchor-only links) based on pre-defined logic (recently adapted for Confluence).

Click to see code...
import os
import re
import csv
from pathlib import Path

# domain name
DOMAIN = "https://domain-name.com/subpage/"

# Read CSV file, map .md filenames to page names (md_file,page_name)
def load_mappings(csv_file):
    mappings = {}
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            md_file = row['md_file'].strip().replace(".md", "")  
            page_name = row['page_name'].strip()
            mappings[md_file] = page_name
    return mappings

# Format the anchor text according to the rules
def format_anchor(anchor):
    # return ''.join(word.capitalize() for word in anchor.split('-')) # CamelCase
    return anchor.replace('-', '').capitalize() # Sentencecase


# find md links
MD_LINK_PATTERN = re.compile(r'(\[.*?\])\(([^\)]+)\)')

# Replace links in md files
def replace_links_in_file(md_file, mappings):
    with open(md_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    def replace_match(match):
        text, link = match.groups()
        if link.startswith('#'):
            # anchor-only link: the target is the current file, so use its own name
            file_name = Path(md_file).stem
            parts = ["", link[1:]]  # keep the anchor in parts[1] for the shared logic below
        else:
            # get filename and anchor from the link
            parts = link.split('#')
            file_name = Path(parts[0]).stem
        
        anchor = parts[1] if len(parts) > 1 else None
        
        if file_name in mappings:
            page_name = mappings[file_name].replace(' ', '+')
            new_link = DOMAIN + page_name
            if anchor:
                new_link += f"#{page_name.replace('+','')}-{format_anchor(anchor)}"
            return f"{text}({new_link})"
        return match.group(0)  # unchanged if no match
    
    updated_content = MD_LINK_PATTERN.sub(replace_match, content)
    
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(updated_content)

# process md files recursively
def process_markdown_files(directory, csv_file):
    mappings = load_mappings(csv_file)
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):
                replace_links_in_file(os.path.join(root, file), mappings)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Replace Markdown links with absolute references")
    parser.add_argument("directory", help="Directory with markdown files")
    parser.add_argument("csv_file", help="CSV with md_file,page_name")
    args = parser.parse_args()
 
    process_markdown_files(args.directory, args.csv_file)
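
For reference, the mapping CSV and the resulting rewrite look roughly like this (the file names and page names below are made up):

md_file,page_name
getting-started.md,Getting Started
faq.md,Frequently Asked Questions

before: [setup](getting-started.md#install-steps)
after:  [setup](https://domain-name.com/subpage/Getting+Started#GettingStarted-Installsteps)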

Update yml (but like it's text)

Why update .yml files but read them as plaintext? It turns out that in my case they don't follow consistent rules when it comes to quoting, so instead of trying to figure out the reasons and/or enforce one style I decided to treat them like text files. This also works well because the changes I'm automating in these YAMLs are minimal: one line (or two) in each file contains some PL text, and I need to add an extra line with the EN key and value based on the existing PL line before I can start editing the file manually or deserializing it.

Click to see code...
import csv
import os


def get_glossary(csv_file):
    # Read the CSV file with the glossary
    csv_mappings = []
    with open(csv_file, encoding="utf-8") as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if len(row) == 2:
                old_name, new_name = row
                csv_mappings.append((old_name, new_name))
    terms_desc = sorted(csv_mappings, key=lambda x: len(x[0]), reverse=True)
    return terms_desc


def write_settings(root_dir, terms_desc):
    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            if filename == "settings.yml": # Find the files to edit
                file_path = os.path.join(root, filename)
                with open(file_path, encoding="utf-8") as f:
                    content = f.readlines()
                new_content = []
                for line in content:
                    if "pl:" in line: # only touch lines with the pl key
                        line_pl = line
                        for pl_term, en_term in terms_desc:
                            if pl_term in line_pl:
                                line_en = line_pl.replace("pl:", "en:")
                                line_en = line_en.replace(pl_term, en_term)
                                line = f"{line_pl.rstrip()}\n{line_en}"
                                break
                    new_content.append(line.rstrip())
                with open(file_path, "w", encoding="utf-8") as f:
                    # write the file back with the new EN lines added
                    f.write("\n".join(new_content))
                    f.write("\n")


GLOSSARY = "../path/to/glossary.file"
ROOT_DIR = "../path/to/search"
terms_desc = get_glossary(GLOSSARY)
write_settings(ROOT_DIR, terms_desc)
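
For example (the key name and the terms below are made up), a glossary row like "Ocena projektu,Project assessment" turns this line in settings.yml:

  title_pl: Ocena projektu

into:

  title_pl: Ocena projektu
  title_en: Project assessment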

Of course, to make this work I need a glossary. I collect the candidate terms with another script.

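The original collection script isn't reproduced here; below is a minimal sketch of the idea, assuming the candidates are simply the values of the pl: lines found in the settings.yml files (the output file name is a placeholder):

import csv
import os

ROOT_DIR = "../path/to/search"
candidates = set()

for root, dirs, files in os.walk(ROOT_DIR):
    for filename in files:
        if filename == "settings.yml":
            with open(os.path.join(root, filename), encoding="utf-8") as f:
                for line in f:
                    if "pl:" in line:
                        # everything after the key is a candidate PL term
                        candidates.add(line.split(":", 1)[1].strip().strip('"\''))

# one candidate per row; the EN column is filled in later (manually or by translation)
with open("glossary_candidates.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for term in sorted(candidates):
        writer.writerow([term, ""])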

The glossary used here is a .csv, usually compiled from two files (the source and target of a translation).

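Again, the original script isn't shown here; a minimal sketch, assuming the source and target files are plain text with one matching segment per line (the paths are placeholders):

import csv

SOURCE = "../path/to/source_pl.txt"  # placeholder paths
TARGET = "../path/to/target_en.txt"

with open(SOURCE, encoding="utf-8") as src, open(TARGET, encoding="utf-8") as tgt:
    pairs = [
        (pl.strip(), en.strip())
        for pl, en in zip(src, tgt)
        if pl.strip() and en.strip()
    ]

# one PL,EN pair per row, the format the other scripts expect
with open("glossary.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(pairs)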
