Added script to extract plain text from TEI

26c751cc · Nina Brolich · 4affddd1 · 26c751cc
Commit 26c751cc authored 2 years ago by Nina Brolich
--- a/plain_text.py
+++ b/plain_text.py
@@ -8,6 +8,7 @@ dir = os.path.join(cwd, "teis")
 outputdir = os.path.join(cwd, "teis_converted")
 directorylist = os.listdir(dir)

+
 def openXML(filepath):
    with open(filepath, mode='r', encoding='utf-8') as file:
        content = file.read()
@@ -15,6 +16,7 @@ def openXML(filepath):
    doc = BeautifulSoup(content, 'xml')
    return doc

+
 def writeXML(outputpath, outputdir, doc):
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)
@@ -22,54 +24,64 @@ def writeXML(outputpath, outputdir, doc):
        file.write(str(doc))
        file.close()

+
 def toPlainText(doc):
-    #hand arzt
+    # additions by hand "arzt": wrap in [arzt] markers
    for hand in doc.findAll("add", hand="arzt"):
        addbrackets(hand, "arzt")
-    #unclear
+    # unclear readings: wrap in [?] markers
    for unclear in doc.findAll("unclear"):
        addbrackets(unclear, "?")
-    #del
+    # deletions: wrap in [-hand2] if a hand attribute is present, else [-]
    for del_elem in doc.findAll("del"):
        if "hand" in del_elem.attrs:
            addbrackets(del_elem, '-hand2')
        else:
            addbrackets(del_elem, '-')

-    #alle leerzeichen in <w> tags loeschen
+    # remove all whitespace inside <w> tags
    for w in doc.findAll("w"):
        w_text = w.text
        w.clear()
        w.append(re.sub('\s+', '', w_text))

-    #leerzeichen zwischen zwei w-tags einfuegen
+    # insert a space between two directly adjacent <w> tags
    for w in doc.findAll("w"):
-        #sib = w.next_sibling
+        # sib = w.next_sibling
        if str(w.next_sibling).startswith('<w'):
            w.append(' ')

-    doctext = doc.body.text
-    #sonderzeichen ersetzen 'm&#x0304;' -> mm , 'n&#x0304;' -> nn,'&#8722;' -> -,'&amp;' -> &,
+    # ensure a space before every line break (<lb>), done on the serialized markup
+    doctext = str(doc)
+    # doctext = re.sub('[^\s]<lb', ' <lb', doctext)
+    doctext = re.sub('<lb', ' <lb', doctext)
+    # remove whitespace after a line break (currently disabled)
+    # doctext = re.sub('<lb break=["yes"|"no"]>\s+', '<lb/>', doctext)
+
+    doctext = BeautifulSoup(doctext, "xml").body.text
+
+    # drop whitespace before a whitespace-surrounded [arzt]/[?]/[-] marker, e.g. Albert [arzt] Abschrift -> Albert[arzt] Abschrift
+    doctext = re.sub('(\s+)(\[)(arzt|\?|\-)(\])(\s+)', '\\2\\3\\4\\5', doctext)
+
+    # replace special characters 'm&#x0304;' -> mm, 'n&#x0304;' -> nn ('&#8722;' -> - and '&amp;' -> & are listed but not done here)
    doctext = re.sub('m\u0304', 'mm', doctext)
    doctext = re.sub('n\u0304', 'nn', doctext)
-    #doctext = re.sub('\u8722', '-', doctext)
-

-    #mehrere leerzeichen zu einem leerzeichen
+    # collapse runs of whitespace into a single space
    doctext = re.sub('\s+', ' ', doctext)

-    #leerzeichen vor linebreak sicherstellen
-    doctext = re.sub('[^\s]<lb', ' <lb', doctext)
    return doctext.strip()

+
 def addbrackets(tag, symbol):
    rep = "[" + symbol + "]"
-    #tagtext = tag.text
-    #tag.clear()
-    #tag.insert(0, rep + tagtext + rep)
+    # Put the marker "[symbol]" first and last among the tag's children.
+    # The existing children stay in place; an earlier variant instead did
+    # tag.clear() and re-inserted rep + tag.text + rep as a single string.
    tag.insert(0, rep)
    tag.append(rep)

+
 for directory in tqdm(directorylist):
    directorypath = os.path.join(dir, directory)
    filelist = os.listdir(directorypath)
@@ -80,4 +92,4 @@ for directory in tqdm(directorylist):
        outputpath = os.path.join(outputdir, directory, file)
        doc = openXML(filepath)
        res = toPlainText(doc)
-        writeXML(outputpath, os.path.join(outputdir, directory), res)
\ No newline at end of file
+        writeXML(outputpath, os.path.join(outputdir, directory), res)