"""Convert the TEI XML files below ``./teis`` into plain-text files.

Every file found in a sub-directory of ``./teis`` is parsed, flattened to
plain text with inline markers (see ``toPlainText``) and written under the
same relative name below ``./teis_converted``.
"""
from bs4 import BeautifulSoup
import re
import os
from tqdm import tqdm

cwd = os.getcwd()
# NOTE: ``dir`` shadows the builtin; the name is kept so that existing
# references to this module-level variable keep working.
dir = os.path.join(cwd, "teis")
outputdir = os.path.join(cwd, "teis_converted")
directorylist = os.listdir(dir)

# Raw-string patterns: the former non-raw literals ('\s+', '\[' ...) are
# invalid escape sequences and emit a SyntaxWarning on Python 3.12+.
_WHITESPACE_RE = re.compile(r'\s+')
_BRACKET_RE = re.compile(r'(\s+)(\[)(arzt|\?|\-)(\])(\s+)')


def openXML(filepath):
    """Read ``filepath`` as UTF-8 and parse it with the ``xml`` parser.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        The ``BeautifulSoup`` document built from the file content.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, mode='r', encoding='utf-8') as file:
        content = file.read()
    return BeautifulSoup(content, 'xml')


def writeXML(outputpath, outputdir, doc):
    """Write ``str(doc)`` to ``outputpath`` as UTF-8.

    Args:
        outputpath: Path of the file to (over)write.
        outputdir: Directory that is created first if it does not exist.
        doc: Object whose ``str()`` form is written; this script passes the
            plain-text string returned by ``toPlainText``.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(outputdir, exist_ok=True)
    with open(outputpath, mode='w', encoding='utf-8') as file:
        file.write(str(doc))


def toPlainText(doc):
    """Flatten a parsed document to a single-line plain-text string.

    The document is modified in place: marker strings are inserted into
    ``<add hand="arzt">``, ``<unclear>`` and ``<del>`` elements and the
    content of every ``<w>`` element is rewritten.

    Args:
        doc: ``BeautifulSoup`` document; its serialisation must contain a
            ``<body>`` element, otherwise the ``.body.text`` lookup raises
            ``AttributeError``.

    Returns:
        The text of ``<body>`` with whitespace runs collapsed to one space
        and leading/trailing whitespace stripped.
    """
    # Additions with hand="arzt" are wrapped in [arzt] markers.
    for hand in doc.find_all("add", hand="arzt"):
        addbrackets(hand, "arzt")

    # Unclear passages are wrapped in [?] markers.
    for unclear in doc.find_all("unclear"):
        addbrackets(unclear, "?")

    # Deletions: [-hand2] when a "hand" attribute is present, else [-].
    for del_elem in doc.find_all("del"):
        if "hand" in del_elem.attrs:
            addbrackets(del_elem, '-hand2')
        else:
            addbrackets(del_elem, '-')

    # Remove all whitespace inside <w> elements; any nested markup is
    # replaced by the element's plain text.
    for w in doc.find_all("w"):
        w_text = w.text
        w.clear()
        w.append(_WHITESPACE_RE.sub('', w_text))

    # Insert a space between two adjacent <w> elements. The check is textual,
    # so it is also true for any other sibling whose serialisation starts
    # with "<w".
    for w in doc.find_all("w"):
        if str(w.next_sibling).startswith('<w'):
            w.append(' ')

    # Ensure there is a space in front of every line break element.
    doctext = str(doc).replace('<lb', ' <lb')

    # Re-parse the patched markup and keep only the text of <body>.
    doctext = BeautifulSoup(doctext, "xml").body.text

    # Drop the whitespace in front of a [arzt], [?] or [-] marker that is
    # itself followed by whitespace; the trailing whitespace is kept.
    # [-hand2] markers are not matched by this pattern.
    doctext = _BRACKET_RE.sub(r'\2\3\4\5', doctext)

    # Expand letters carrying a combining macron (U+0304): m -> mm, n -> nn.
    doctext = doctext.replace('m\u0304', 'mm')
    doctext = doctext.replace('n\u0304', 'nn')

    # Collapse every run of whitespace into a single space.
    doctext = _WHITESPACE_RE.sub(' ', doctext)
    return doctext.strip()


def addbrackets(tag, symbol):
    """Surround the content of ``tag`` with ``[symbol]`` markers, in place.

    Args:
        tag: Element that receives the marker as its first and last child.
        symbol: Text placed between the square brackets.
    """
    rep = "[" + symbol + "]"
    tag.insert(0, rep)
    tag.append(rep)


for directory in tqdm(directorylist):
    directorypath = os.path.join(dir, directory)
    # Output files go to a sub-directory with the same name as the input one.
    targetdir = os.path.join(outputdir, directory)
    filelist = os.listdir(directorypath)
    for file in tqdm(filelist):
        # Skip an entry literally named "f".
        # NOTE(review): the purpose of this exclusion is not evident here.
        if file == "f":
            continue
        filepath = os.path.join(directorypath, file)
        outputpath = os.path.join(targetdir, file)
        doc = openXML(filepath)
        res = toPlainText(doc)
        writeXML(outputpath, targetdir, res)