from bs4 import BeautifulSoup
import re
import os
from tqdm import tqdm

# Input and output roots are resolved against the current working directory,
# so the script has to be started from the folder that contains "teis".
cwd = os.getcwd()
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the conversion loop at the bottom of the file reads it.
dir = os.path.join(cwd, "teis")
outputdir = os.path.join(cwd, "teis_converted")
# Entries of the input root, listed once when the module is executed.
# os.listdir() raises if the "teis" folder does not exist.
directorylist = os.listdir(dir)


def openXML(filepath):
    """Read a UTF-8 encoded XML file and parse it.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        The ``BeautifulSoup`` document built from the file content with the
        ``'xml'`` parser.
    """
    # The ``with`` block closes the handle on exit, so no explicit close()
    # call is needed.
    with open(filepath, mode='r', encoding='utf-8') as file:
        content = file.read()
    return BeautifulSoup(content, 'xml')


def writeXML(outputpath, outputdir, doc):
    """Write ``str(doc)`` to ``outputpath`` as UTF-8 text.

    Args:
        outputpath: Path of the file to write. An existing file is
            overwritten.
        outputdir: Directory that is created (including missing parents)
            before writing. It is not derived from ``outputpath``; the caller
            is expected to pass the directory that contains ``outputpath``.
        doc: Object whose ``str()`` representation is written.
    """
    # exist_ok=True replaces the separate isdir() check and avoids the
    # check-then-create race between the two calls.
    os.makedirs(outputdir, exist_ok=True)
    # The ``with`` block closes the handle on exit.
    with open(outputpath, mode='w', encoding='utf-8') as file:
        file.write(str(doc))


def toPlainText(doc):
    """Flatten a parsed TEI document into one line of plain text.

    Editorial markup is replaced by inline bracket markers that are placed at
    the start and at the end of the element's content:

    * ``<add hand="arzt">`` -> ``[arzt]``
    * ``<unclear>``         -> ``[?]``
    * ``<del>``             -> ``[-]``, or ``[-hand2]`` when the element has a
      ``hand`` attribute

    Afterwards whitespace inside ``<w>`` elements is removed, words and line
    breaks are separated by spaces, the text of ``<body>`` is extracted,
    letters with a combining macron are expanded and all whitespace runs are
    collapsed to a single space.

    Note:
        ``doc`` is modified in place.

    Args:
        doc: BeautifulSoup document, e.g. as returned by ``openXML``.

    Returns:
        The plain text of the document body, stripped of leading and trailing
        whitespace.

    Raises:
        AttributeError: If the document has no ``<body>`` element
            (``.body`` is then ``None``).
    """
    # Mark additions made by the "arzt" hand.
    for hand in doc.find_all("add", hand="arzt"):
        addbrackets(hand, "arzt")
    # Mark unclear readings.
    for unclear in doc.find_all("unclear"):
        addbrackets(unclear, "?")
    # Mark deletions; those carrying a ``hand`` attribute get their own marker.
    for del_elem in doc.find_all("del"):
        addbrackets(del_elem, '-hand2' if "hand" in del_elem.attrs else '-')

    whitespace = re.compile(r'\s+')

    # Remove all whitespace inside <w> tags. This also flattens any nested
    # markup inside the word to plain text.
    for w in doc.find_all("w"):
        w_text = w.text
        w.clear()
        w.append(whitespace.sub('', w_text))

    # Insert a space between two directly adjacent <w> tags. The check works
    # on the serialised sibling, so it matches every sibling whose markup
    # starts with "<w", not only <w> itself.
    for w in doc.find_all("w"):
        if str(w.next_sibling).startswith('<w'):
            w.append(' ')

    # Make sure there is a space in front of every line break. This is done
    # on the serialised markup, which is then parsed again to get the text.
    doctext = str(doc).replace('<lb', ' <lb')
    doctext = BeautifulSoup(doctext, "xml").body.text

    # Drop the whitespace in front of a bracket marker that is surrounded by
    # whitespace, e.g. "Albert [arzt] Abschrift" -> "Albert[arzt] Abschrift".
    # The whitespace after the marker is kept.
    # NOTE(review): "[-hand2]" is not covered by this pattern - confirm
    # whether that is intended.
    doctext = re.sub(r'(\s+)(\[)(arzt|\?|\-)(\])(\s+)', r'\2\3\4\5', doctext)

    # Expand letters carrying a combining macron (U+0304): m -> mm, n -> nn.
    # NOTE(review): the original comment also listed '&#8722;' -> '-' and
    # '&amp;' -> '&'; there is no explicit replacement for those here.
    doctext = doctext.replace('m\u0304', 'mm').replace('n\u0304', 'nn')

    # Collapse every run of whitespace into a single space.
    doctext = whitespace.sub(' ', doctext)

    return doctext.strip()


def addbrackets(tag, symbol):
    """Wrap the content of ``tag`` in ``[symbol]`` markers.

    The marker is inserted as the first child of ``tag`` and appended as its
    last child; ``tag`` is modified in place and nothing is returned.

    Args:
        tag: Object offering ``insert(index, item)`` and ``append(item)``.
        symbol: Text placed between the square brackets.
    """
    marker = "".join(("[", symbol, "]"))
    tag.insert(0, marker)
    tag.append(marker)


# Convert every file in every sub-directory of the input root and write the
# result under the same relative path below the output root.
for directory in tqdm(directorylist):
    directorypath = os.path.join(dir, directory)
    # A plain file in the input root (e.g. ".DS_Store") would make
    # os.listdir() raise NotADirectoryError and abort the whole run, so such
    # entries are skipped.
    if not os.path.isdir(directorypath):
        continue
    filelist = os.listdir(directorypath)
    for file in tqdm(filelist):
        # Entries named exactly "f" are not converted.
        if file == "f":
            continue
        filepath = os.path.join(directorypath, file)
        outputpath = os.path.join(outputdir, directory, file)
        doc = openXML(filepath)
        res = toPlainText(doc)
        writeXML(outputpath, os.path.join(outputdir, directory), res)