Added script to extract plain text from TEI

41c2d466 · Nina Brolich · 26c751cc · 41c2d466
Commit 41c2d466 authored 2 years ago by Nina Brolich
--- a/plain_text.py
+++ b/plain_text.py
@@ -3,11 +3,6 @@ import re
 import os
 from tqdm import tqdm

-cwd = os.getcwd()
-dir = os.path.join(cwd, "teis")
-outputdir = os.path.join(cwd, "teis_converted")
-directorylist = os.listdir(dir)
-

 def openXML(filepath):
    with open(filepath, mode='r', encoding='utf-8') as file:
@@ -53,10 +48,7 @@ def toPlainText(doc):

    # leerzeichen vor linebreak sicherstellen
    doctext = str(doc)
-    # doctext = re.sub('[^\s]<lb', ' <lb', doctext)
-    doctext = re.sub('<lb', ' <lb', doctext)
-    # leerzeichen nach linebreak loeschen
-    # doctext = re.sub('<lb break=["yes"|"no"]>\s+', '<lb/>', doctext)
+    doctext = re.sub('<lb break=\"yes\"', ' <lb break=\"yes\"', doctext)

    doctext = BeautifulSoup(doctext, "xml").body.text

@@ -75,12 +67,13 @@ def toPlainText(doc):

 def addbrackets(tag, symbol):
    rep = "[" + symbol + "]"
-    # tagtext = tag.text
-    # tag.clear()
-    # tag.insert(0, rep + tagtext + rep)
    tag.insert(0, rep)
    tag.append(rep)

+cwd = os.getcwd()
+dir = os.path.join(cwd, "teis")
+outputdir = os.path.join(cwd, "teis_converted")
+directorylist = os.listdir(dir)

 for directory in tqdm(directorylist):
    directorypath = os.path.join(dir, directory)