files
I'm working on a sizeable collection of texts (around 1 million words) spread across a number of GitHub repos. Dealing with them manually would be a chore, so I wrote a couple of automation scripts.
Remove BOM
MemoQ has a habit of exporting UTF-8 plaintext/markdown with a BOM, which is not caught by our course-building script and messes up the final HTML.
import os


def has_bom(filepath):
    with open(filepath, "rb") as file:
        bom_check = file.read(3)
    return bom_check == b"\xEF\xBB\xBF"


def convert_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


def drop_bom(filepath):
    with open(filepath, "r", encoding="utf-8-sig") as file:
        content = file.read()
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"saved {filepath} without BOM")


# directory containing the files to convert
directory = "../.."
convert_files(directory)
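Since the problematic exports are plaintext or markdown, the walk can also be narrowed down to text-like files instead of touching everything in the tree. A minimal sketch, reusing has_bom and drop_bom from the script above (the extension list is my assumption, not something the original script enforces):

import os

# extensions to check; adjust to whatever the exports actually use
TEXT_EXTENSIONS = (".md", ".txt")


def convert_text_files(directory):
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if not filename.lower().endswith(TEXT_EXTENSIONS):
                continue  # skip binaries and anything else
            filepath = os.path.join(root, filename)
            if has_bom(filepath):
                print(filename)
                drop_bom(filepath)


# convert_text_files("../..")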
Replace local links with absolute URLs in markdown
This is useful when pasting a bunch of markdown files into some shared space like Google Drive (provided a hosted version already exists somewhere). It needs a CSV file matching file names to page names, and it handles anchors (or anchor-only links) based on pre-defined logic (recently adapted for Confluence).
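For example, assuming a hypothetical CSV row like intro.md,Getting Started (under the md_file,page_name header), a link such as [see intro](intro.md#first-steps) is rewritten to [see intro](https://domain-name.com/subpage/Getting+Started#GettingStarted-Firststeps): spaces in the page name become +, and the anchor gets the Confluence-style PageName-Sentencecase suffix.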
import os
import re
import csv
from pathlib import Path

# domain name
DOMAIN = "https://domain-name.com/subpage/"


# Read CSV file, map .md filenames to page names (md_file,page_name)
def load_mappings(csv_file):
    mappings = {}
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            md_file = row['md_file'].strip().replace(".md", "")
            page_name = row['page_name'].strip()
            mappings[md_file] = page_name
    return mappings


# Format the anchor text according to the rules
def format_anchor(anchor):
    # return ''.join(word.capitalize() for word in anchor.split('-'))  # CamelCase
    return anchor.replace('-', '').capitalize()  # Sentence case


# find md links
MD_LINK_PATTERN = re.compile(r'(\[.*?\])\(([^\)]+)\)')


# Replace links in md files
def replace_links_in_file(md_file, mappings):
    with open(md_file, 'r', encoding='utf-8') as f:
        content = f.read()

    def replace_match(match):
        text, link = match.groups()
        if link.startswith('#'):
            # anchor-only link: match the current md file's name against the csv
            file_name = Path(md_file).stem
            parts = ["", link[1:]]
        else:
            # get filename and anchor from the link
            parts = link.split('#')
            file_name = Path(parts[0]).stem
        anchor = parts[1] if len(parts) > 1 else None
        if file_name in mappings:
            page_name = mappings[file_name].replace(' ', '+')
            new_link = DOMAIN + page_name
            if anchor:
                new_link += f"#{page_name.replace('+','')}-{format_anchor(anchor)}"
            return f"{text}({new_link})"
        return match.group(0)  # unchanged if no match

    updated_content = MD_LINK_PATTERN.sub(replace_match, content)
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(updated_content)


# process md files recursively
def process_markdown_files(directory, csv_file):
    mappings = load_mappings(csv_file)
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):
                replace_links_in_file(os.path.join(root, file), mappings)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Replace Markdown links with absolute references")
    parser.add_argument("directory", help="Directory with markdown files")
    parser.add_argument("csv_file", help="CSV with md_file,page_name")
    args = parser.parse_args()
    process_markdown_files(args.directory, args.csv_file)
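Run it as, for instance, python replace_links.py ./content pages.csv (the script and file names here are just placeholders); it rewrites the markdown files in place, so it's best tried on a clean git checkout.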
Update yml (but like it's text)
Why update .yml files but read them as plaintext? It turns out that in my case they don't follow consistent rules when it comes to quoting, so instead of trying to figure out the reasons and/or enforce one style, I decided to treat them like text files. This also works well because the changes I'm automating in these yamls are minimal: one line (or two) in each file contains some PL text, and I need to add an extra line with the EN key and value based on the existing PL before I can start editing the file manually or deserializing it.
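For example (the term pair is invented for illustration and would come from the glossary CSV described below), a file containing

 pl: 'Wprowadzenie'

ends up with an extra EN line right under it:

 pl: 'Wprowadzenie'
 en: 'Introduction'

with the PL term swapped for its EN counterpart.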
import csv
import os


def get_glossary(csv_file):
    # Read the CSV file with the glossary (old_name,new_name per row)
    csv_mappings = []
    with open(csv_file, encoding="utf-8") as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if len(row) == 2:
                old_name, new_name = row
                csv_mappings.append((old_name, new_name))
    # longest terms first, so a longer match wins over its substrings
    terms_desc = sorted(csv_mappings, key=lambda x: len(x[0]), reverse=True)
    return terms_desc


def write_settings(root_dir, terms_desc):
    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            if filename == "settings.yml":  # find the files to edit
                file_path = os.path.join(root, filename)
                with open(file_path, encoding="utf-8") as f:
                    content = f.readlines()
                new_content = []
                for line in content:
                    if "pl:" in line:  # only touch lines with the pl key
                        line_pl = line
                        for pl_term, en_term in terms_desc:
                            if pl_term in line_pl:
                                line_en = line_pl.replace("pl:", "en:")
                                line_en = line_en.replace(pl_term, en_term)
                                line = f"{line_pl.rstrip()}\n{line_en}"
                                break
                    new_content.append(line.rstrip())
                with open(file_path, "w", encoding="utf-8") as f:
                    # save the same file with the added en lines
                    f.write("\n".join(new_content))
                    f.write("\n")


GLOSSARY = "../path/to/glossary.file"
ROOT_DIR = "../path/to/search"

terms_desc = get_glossary(GLOSSARY)
write_settings(ROOT_DIR, terms_desc)
Of course, to make it work I need a glossary. I collect the candidate terms with another script.
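The collector assumes settings.yml files shaped roughly like the sample below (the structure is reconstructed from how the script reads the data; the values themselves are made-up examples). A quick way to check that assumption against a real file is to load it with yaml.safe_load:

import yaml

# assumed shape of a settings.yml, reconstructed from how the collector reads it;
# the PL values are hypothetical examples
sample = yaml.safe_load("""
grading:
  - criterion:
      pl: Poprawność terminologii
translations:
  pl: 01_wprowadzenie
""")

print(sample["grading"][0]["criterion"]["pl"])  # Poprawność terminologii
print(sample["translations"]["pl"])  # 01_wprowadzenie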
import os
import re

import yaml


def read_settings(ROOT_DIR):
    grading_criteria = set()
    translations_pl = set()
    for root, dirs, files in os.walk(ROOT_DIR):
        for filename in files:
            if filename == "settings.yml":
                file_path = os.path.join(root, filename)
                print(file_path)
                with open(file_path, encoding="utf-8") as yaml_file:
                    data = yaml.safe_load(yaml_file)
                print(data)
                try:  # there's an odd irregular file in the repo
                    for entry in data["grading"]:
                        grading_criteria.add(entry["criterion"]["pl"])
                except Exception as e:
                    print(f"{e} not found")
                # the values start with numbers that don't need changing
                translation_pattern = r"(\d+_)?(.+)"
                try:
                    pl_value = data["translations"]["pl"]
                    translation_match = re.match(translation_pattern, pl_value)
                    if translation_match:
                        extracted_text = translation_match.group(2)
                    else:
                        extracted_text = pl_value
                    translations_pl.add(extracted_text)
                except Exception as e:
                    print(f"{e} not found")
                    # if there is no pl value, by default it is based on the name of the parent dir
                    parent_dir = os.path.basename(os.path.dirname(file_path))
                    translations_pl.add(parent_dir)
                    with open(file_path, encoding="utf-8") as f:
                        old_content = f.read()
                    new_content = (
                        old_content.strip() + f"\ntranslations:\n pl: '{parent_dir}'\n"
                    )
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(new_content)
    return grading_criteria, translations_pl


ROOT_DIR = r"../path/to/search"
grading_criteria, translations_pl = read_settings(ROOT_DIR)

if grading_criteria:
    grading_criteria_list = sorted(
        grading_criteria, key=len
    )  # or reverse=True to get better Translation Memory hits
    with open("settings_grading_criteria.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(grading_criteria_list))

translations_pl_list = sorted(translations_pl, key=len)
with open("settings_translations_pl.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(translations_pl_list))
The glossary used here is a .csv, usually compiled from two files (the source and the target of a translation).
import csv

# Define the paths to your input text files
file1_path = "../path/to/pl.file"
file2_path = "../path/to/en.file"

# Define the path to the output CSV file
output_csv_path = "../path/to/pl-en.csv"

# Read the contents of the input text files
with open(file1_path, encoding="utf-8") as file1, open(
    file2_path, encoding="utf-8"
) as file2:
    file1_lines = file1.readlines()
    file2_lines = file2.readlines()

# Combine the lines from both files into a list of tuples
combined_lines = [
    (line1.strip(), line2.strip()) for line1, line2 in zip(file1_lines, file2_lines)
]

# Write the combined lines to a CSV file
with open(output_csv_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    header = ["pl", "en"]
    csv_writer.writerow(header)
    for row in combined_lines:
        csv_writer.writerow(row)

print(
    f"Combined {len(combined_lines)} rows from {file1_path} and {file2_path} into {output_csv_path}"
)
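The two input files are expected to be line-aligned (one term per line, in the same order); note that zip() stops at the shorter file, so any extra lines at the end of either input are silently dropped. The resulting CSV then looks something like this (rows invented for illustration):

pl,en
Wprowadzenie,Introduction
Zadanie,Task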