Skip to content
Snippets Groups Projects
Commit 41c2d466 authored by Nina Brolich's avatar Nina Brolich
Browse files

Added script to extract plain text from TEI

parent 26c751cc
No related branches found
No related tags found
No related merge requests found
......@@ -3,11 +3,6 @@ import re
import os
from tqdm import tqdm
# Directory the script is started from; the paths below are built relative to it.
cwd = os.getcwd()
# Path to the "teis" folder under the working directory.
# NOTE(review): the name `dir` shadows the builtin dir(); rename once all uses are visible.
dir = os.path.join(cwd, "teis")
# Path to the "teis_converted" folder under the working directory (presumably where results are written -- confirm).
outputdir = os.path.join(cwd, "teis_converted")
# Names of the entries located directly inside the "teis" folder.
directorylist = os.listdir(dir)
def openXML(filepath):
with open(filepath, mode='r', encoding='utf-8') as file:
......@@ -53,10 +48,7 @@ def toPlainText(doc):
# leerzeichen vor linebreak sicherstellen
doctext = str(doc)
# doctext = re.sub('[^\s]<lb', ' <lb', doctext)
doctext = re.sub('<lb', ' <lb', doctext)
# leerzeichen nach linebreak loeschen
# doctext = re.sub('<lb break=["yes"|"no"]>\s+', '<lb/>', doctext)
doctext = re.sub('<lb break=\"yes\"', ' <lb break=\"yes\"', doctext)
doctext = BeautifulSoup(doctext, "xml").body.text
......@@ -75,12 +67,13 @@ def toPlainText(doc):
def addbrackets(tag, symbol):
    """Surround the children of *tag* with bracketed *symbol* markers.

    The marker ``"[" + symbol + "]"`` is inserted as the first child of
    *tag* and appended again as its last child; the existing children stay
    untouched between the two markers.

    Args:
        tag: Object that is mutated in place through ``insert(0, ...)`` and
            ``append(...)`` (presumably a BeautifulSoup tag -- confirm
            against the callers).
        symbol: Text placed between the square brackets.

    Returns:
        None. The change is applied to *tag* directly.
    """
    rep = f"[{symbol}]"
    # Insert first, then append, so the markers end up on both sides of
    # whatever *tag* already contains.
    tag.insert(0, rep)
    tag.append(rep)
# Directory the script is started from; the paths below are built relative to it.
cwd = os.getcwd()
# Path to the "teis" folder under the working directory; its entries are iterated below.
# NOTE(review): the name `dir` shadows the builtin dir(); rename once all uses are visible.
dir = os.path.join(cwd, "teis")
# Path to the "teis_converted" folder under the working directory (presumably where results are written -- confirm).
outputdir = os.path.join(cwd, "teis_converted")
# Names of the entries located directly inside the "teis" folder.
directorylist = os.listdir(dir)
for directory in tqdm(directorylist):
directorypath = os.path.join(dir, directory)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment