# -*- coding: utf-8 -*-
import datetime
import logging
import re
import uuid
from io import StringIO
import unidecode
from lxml import etree
from rdflib.namespace import XSD
from rdflib.term import Literal
import syntok.segmenter as segmenter

def to_iri(s: str = ""):
    """
    Function to convert a string to a valid IRI part by replacing
    or removing problematic characters
    :param s: the string to be converted
    """
    return (
        s.replace('"', "%22")
        .replace("µ", "mu")
        .replace("ª", "_")
        .replace("º", "_")
        .replace("'", "%27")
        .replace(">", "")
        .replace("<", "")
    )
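
# A minimal usage sketch (illustrative):
#
#     >>> to_iri('"µ"')
#     '%22mu%22'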

def tokenize_text(text: str = None, forced_sentence_split_characters: list = []):
    """
    Function to tokenize a text into sentences of tokens, starting a new
    sentence whenever a forced sentence split character is encountered
    :param text: the text to be tokenized
    :param forced_sentence_split_characters: tokens that force a sentence split
    """
    tokenized_text = tokenizer(text)
    tokenized_new = []
    for sentence in tokenized_text:
        tok_sent = []
        for token in sentence:
            # flush the current sentence when a forced split character is found
            if token["text"] in forced_sentence_split_characters:
                if tok_sent != []:
                    tokenized_new.append(tok_sent)
                    tok_sent = []
            tok_sent.append(token)
        tokenized_new.append(tok_sent)
    tokenized_text = tokenized_new
    # delete empty tokens at the end of each sentence
    if tokenized_text != []:
        tokenized_text = [
            sentence if sentence[-1]["text"] != "" else sentence[:-1]
            for sentence in tokenized_text
        ]
    return tokenized_text
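
# Illustrative sketch: with "|" as a forced split character, the tokens of
# "a b | c" end up in two sentences, the second starting at the "|" token
# (assuming syntok tokenizes "|" as a separate token; token dicts are
# abbreviated here to their "text" values):
#
#     [[t["text"] for t in s] for s in tokenize_text("a b | c", ["|"])]
#     # -> [['a', 'b'], ['|', 'c']]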

def replace_escape_characters(text: str = None):
    """
    Function to replace escape characters by spaces (maintaining exact character locations)
    :param text: the text where escape characters should be replaced
    """
    escape_characters = [
        "\a",  # bell
        "\b",  # back space
        "\t",  # tab
        "\n",  # new line
        "\v",  # vertical tab
        "\f",  # form feed
        "\r",  # carriage return
    ]
    escape_character_table = {
        ord(escape_character): " " for escape_character in escape_characters
    }
    return text.translate(escape_character_table)
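
# A minimal check (illustrative): every escape character becomes a single
# space, so character offsets into the text are preserved:
#
#     >>> replace_escape_characters("a\tb\nc")
#     'a b c'
#     >>> len(replace_escape_characters("a\tb\nc")) == len("a\tb\nc")
#     True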

def tokenizer(text: str = None):
    """
    Function to create a list of sentences, each a list of words with the
    text, start_char and end_char of each word
    :param text: the text to be tokenized
    """
    sentences = list()
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            words = list()
            for token in sentence:
                value = text[token.offset : token.offset + len(token.value)]
                if value != token.value:
                    logging.error("Error: incorrect offsets in syntok.segmenter.")
                else:
                    words.append(
                        {
                            "text": token.value,
                            "start_char": token.offset,
                            "end_char": token.offset + len(token.value),
                        }
                    )
            sentences.append(words)
    return sentences
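
# Example of the expected output shape (exact tokens and offsets depend on
# syntok's segmentation):
#
#     tokenizer("Hello world. Bye.")
#     # -> [[{'text': 'Hello', 'start_char': 0, 'end_char': 5},
#     #      {'text': 'world', 'start_char': 6, 'end_char': 11},
#     #      {'text': '.', 'start_char': 11, 'end_char': 12}],
#     #     [{'text': 'Bye', 'start_char': 13, 'end_char': 16},
#     #      {'text': '.', 'start_char': 16, 'end_char': 17}]]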

def align_stanza_dict_offsets(stanza_dict: list = None, sentences: list = None):
    """
    Function to align the stanza dict offsets with the offsets from the tokenizer
    :param stanza_dict: the output dict from the Stanza pipeline
    :param sentences: the output of the tokenizer
    """
    # check alignment of stanza_dict and the tokenized sentences
    assert len(stanza_dict) == len(sentences)
    for sent_idx, sent in enumerate(stanza_dict):
        assert len(stanza_dict[sent_idx]) == len(sentences[sent_idx])
    # correct stanza_dict start_char and end_char
    for sent_idx, sent in enumerate(stanza_dict):
        for word_idx, word in enumerate(sent):
            word["start_char"] = sentences[sent_idx][word_idx]["start_char"]
            word["end_char"] = sentences[sent_idx][word_idx]["end_char"]
    return stanza_dict
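
# Minimal sketch with hand-built lists (in practice stanza_dict comes from
# a Stanza pipeline document converted with to_dict()); the tokenizer
# offsets overwrite the ones computed by Stanza:
#
#     stanza_sents = [[{"text": "Hi", "start_char": 9, "end_char": 11}]]
#     tok_sents = [[{"text": "Hi", "start_char": 0, "end_char": 2}]]
#     align_stanza_dict_offsets(stanza_sents, tok_sents)
#     # -> [[{'text': 'Hi', 'start_char': 0, 'end_char': 2}]]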

def generate_uuid(uri: str = None, prefix: str = "nif-"):
    """
    Function to generate the uuid for nif
    :param uri: the uri from which the uuid should be generated
    :param prefix: the prefix of the uuid, default = "nif-"
    """
    return prefix + uuid.uuid3(uuid.NAMESPACE_DNS, uri).hex
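
# uuid3 is deterministic, so the same uri always yields the same identifier
# (illustrative; the digest is stable across runs and machines):
#
#     generate_uuid("https://example.org/doc")
#     # -> 'nif-' followed by 32 hex characters, identical on every call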

def natural_sort(elements: list = None):
    """
    Function to sort a list of strings with numbers
    :param elements: the list to be sorted
    """

    def convert_to_int(text):
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        return [convert_to_int(c) for c in re.split("([0-9]+)", key)]

    return sorted(elements, key=alphanum_key)
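
# A minimal example (illustrative): numeric runs compare as integers and
# the comparison is case-insensitive:
#
#     >>> natural_sort(["page10", "page2", "Page1"])
#     ['Page1', 'page2', 'page10']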

def delete_accents(text: str = None, lang: str = "en"):
    """
    Function to delete accents from a string
    :param text: the string from which the accents should be deleted
    :param lang: the language of the text in the string
    """
    if lang == Literal("grc", datatype=XSD.string):
        replacements = {
"ἒ": "ἐ",
"ἓ": "ἑ",
"ἔ": "ἐ",
"ἕ": "ἑ",
"έ": "ε",
"ὲ": "ε",
"έ": "ε",
"ἂ": "ἀ",
"ἃ": "ἁ",
"ἄ": "ἀ",
"ἅ": "ἁ",
"ά": "α",
"ὰ": "α",
"ά": "α",
"ᾂ": "ᾀ",
"ᾄ": "ᾀ",
"ᾃ": "ᾁ",
"ᾅ": "ᾁ",
"ᾲ": "ᾳ",
"ᾴ": "ᾳ",
"ί": "ι",
"ἲ": "ἰ",
"ἳ": "ἱ",
"ἴ": "ἰ",
"ἵ": "ἱ",
"ῒ": "ϊ",
"ΐ": "ϊ",
"ὶ": "ι",
"ί": "ι",
"ή": "η",
"ἢ": "ἠ",
"ἣ": "ἡ",
"ἤ": "ἠ",
"ἥ": "ἡ",
"ὴ": "η",
"ή": "η",
"ΰ": "ϋ",
"ύ": "υ",
"ὒ": "ὐ",
"ὓ": "ὑ",
"ὔ": "ὐ",
"ὕ": "ὑ",
"ὺ": "υ",
"ύ": "υ",
"ῢ": "ϋ",
"ΰ": "ϋ",
"ὢ": "ὠ",
"ὣ": "ὡ",
"ὤ": "ὠ",
"ὥ": "ὡ",
"ὼ": "ω",
"ώ": "ω",
"ό": "ο",
"ὂ": "ὀ",
"ὃ": "ὁ",
"ὄ": "ὀ",
"ὅ": "ὁ",
"ὸ": "ο",
"ό": "ο",
"ᾢ": "ᾠ",
"ᾣ": "ᾡ",
"ᾤ": "ᾠ",
"ᾥ": "ᾡ",
"ῲ": "ῳ",
"ῴ": "ῳ",
        }
        for source, target in replacements.items():
            text = text.replace(source, target)
    else:
        text = unidecode.unidecode(text)
    return text
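
# Illustrative examples: for Ancient Greek only the acute and grave accents
# are removed (breathings, circumflexes and iota subscripts are kept); any
# other language falls back to unidecode transliteration:
#
#     >>> delete_accents("café")
#     'cafe'
#     >>> delete_accents("ἄειδε", lang=Literal("grc", datatype=XSD.string))
#     'ἀειδε'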

def delete_diacritics(text: str = None, lang: str = "en"):
    """
    Function to delete diacritics from a string
    :param text: the string from which the diacritics should be deleted
    :param lang: the language of the text in the string
    """
    if lang == Literal("grc", datatype=XSD.string):
        replacements = {
"Ά": "Α",
"Ᾰ": "Α",
"Ᾱ": "Α",
"Ὰ": "Α",
"Ά": "Α",
"Έ": "Ε",
"Ὲ": "Ε",
"Έ": "Ε",
"Ή": "Η",
"Ὴ": "Η",
"Ή": "Η",
"Ί": "Ι",
"Ϊ": "Ι",
"Ό": "Ο",
"Ὸ": "Ο",
"Ό": "Ο",
"Ύ": "Υ",
"Ϋ": "Υ",
"Ώ": "Ω",
"ϓ": "ϒ",
"ϔ": "ϒ",
"Ὑ": "ϒ",
"Ὓ": "ϒ",
"Ὕ": "ϒ",
"Ὗ": "ϒ",
"Ῠ": "ϒ",
"Ῡ": "ϒ",
"Ὺ": "ϒ",
"Ύ": "ϒ",
"ἀ": "α",
"ἁ": "α",
"ἂ": "α",
"ἃ": "α",
"ἄ": "α",
"ἅ": "α",
"ἆ": "α",
"ἇ": "α",
"ά": "α",
"ὰ": "α",
"ά": "α",
"ᾰ": "α",
"ᾱ": "α",
"ᾶ": "α",
"Ἀ": "Α",
"Ἁ": "Α",
"Ἂ": "Α",
"Ἃ": "Α",
"Ἄ": "Α",
"Ἅ": "Α",
"Ἆ": "Α",
"Ἇ": "Α",
"ἐ": "ε",
"ἑ": "ε",
"ἒ": "ε",
"ἓ": "ε",
"ἔ": "ε",
"ἕ": "ε",
"έ": "ε",
"ὲ": "ε",
"έ": "ε",
"Ἐ": "Ε",
"Ἑ": "Ε",
"Ἒ": "Ε",
"Ἓ": "Ε",
"Ἔ": "Ε",
"Ἕ": "Ε",
"ἠ": "η",
"ἡ": "η",
"ἢ": "η",
"ἣ": "η",
"ἤ": "η",
"ἥ": "η",
"ἦ": "η",
"ἧ": "η",
"ή": "η",
"ὴ": "η",
"ή": "η",
"ῆ": "η",
"Ἠ": "Η",
"Ἡ": "Η",
"Ἢ": "Η",
"Ἣ": "Η",
"Ἤ": "Η",
"Ἥ": "Η",
"Ἦ": "Η",
"Ἧ": "Η",
"ἰ": "ι",
"ἱ": "ι",
"ἲ": "ι",
"ἳ": "ι",
"ἴ": "ι",
"ἵ": "ι",
"ἶ": "ι",
"ἷ": "ι",
"ΐ": "ι",
"ϊ": "ι",
"ί": "ι",
"ὶ": "ι",
"ί": "ι",
"ῐ": "ι",
"ῑ": "ι",
"ῒ": "ι",
"ΐ": "ι",
"ῖ": "ι",
"ῗ": "ι",
"ΰ": "υ",
"ϋ": "υ",
"ύ": "υ",
"ὐ": "υ",
"ὑ": "υ",
"ὒ": "υ",
"ὓ": "υ",
"ὔ": "υ",
"ὕ": "υ",
"ὖ": "υ",
"ὗ": "υ",
"ὺ": "υ",
"ύ": "υ",
"ῠ": "υ",
"ῡ": "υ",
"ῢ": "υ",
"ΰ": "υ",
"ῦ": "υ",
"ῧ": "υ",
"ό": "ο",
"ὀ": "ο",
"ὁ": "ο",
"ὂ": "ο",
"ὃ": "ο",
"ὄ": "ο",
"ὅ": "ο",
"ὸ": "ο",
"ό": "ο",
"ώ": "ω",
"ὠ": "ω",
"ὡ": "ω",
"ὢ": "ω",
"ὣ": "ω",
"ὤ": "ω",
"ὥ": "ω",
"ὦ": "ω",
"ὧ": "ω",
"ὼ": "ω",
"ώ": "ω",
"ῶ": "ω",
"Ἰ": "Ι",
"Ἱ": "Ι",
"Ἲ": "Ι",
"Ἳ": "Ι",
"Ἴ": "Ι",
"Ἵ": "Ι",
"Ἶ": "Ι",
"Ἷ": "Ι",
"Ῐ": "Ι",
"Ῑ": "Ι",
"Ὶ": "Ι",
"Ί": "Ι",
"Ὀ": "Ο",
"Ὁ": "Ο",
"Ὂ": "Ο",
"Ὃ": "Ο",
"Ὄ": "Ο",
"Ὅ": "Ο",
"Ὠ": "Ω",
"Ὡ": "Ω",
"Ὢ": "Ω",
"Ὣ": "Ω",
"Ὤ": "Ω",
"Ὥ": "Ω",
"Ὦ": "Ω",
"Ὧ": "Ω",
"ᾀ": "ᾳ",
"ᾁ": "ᾳ",
"ᾂ": "ᾳ",
"ᾃ": "ᾳ",
"ᾄ": "ᾳ",
"ᾅ": "ᾳ",
"ᾆ": "ᾳ",
"ᾇ": "ᾳ",
"ᾲ": "ᾳ",
"ᾴ": "ᾳ",
"ᾷ": "ᾳ",
"ᾈ": "ᾼ",
"ᾉ": "ᾼ",
"ᾊ": "ᾼ",
"ᾋ": "ᾼ",
"ᾌ": "ᾼ",
"ᾍ": "ᾼ",
"ᾎ": "ᾼ",
"ᾏ": "ᾼ",
"ᾐ": "ῃ",
"ᾑ": "ῃ",
"ᾒ": "ῃ",
"ᾓ": "ῃ",
"ᾔ": "ῃ",
"ᾕ": "ῃ",
"ᾖ": "ῃ",
"ᾗ": "ῃ",
"ῂ": "ῃ",
"ῄ": "ῃ",
"ῇ": "ῃ",
"ᾘ": "ῌ",
"ᾙ": "ῌ",
"ᾚ": "ῌ",
"ᾛ": "ῌ",
"ᾜ": "ῌ",
"ᾝ": "ῌ",
"ᾞ": "ῌ",
"ᾟ": "ῌ",
"ᾠ": "ῳ",
"ᾡ": "ῳ",
"ᾢ": "ῳ",
"ᾣ": "ῳ",
"ᾤ": "ῳ",
"ᾥ": "ῳ",
"ᾦ": "ῳ",
"ᾧ": "ῳ",
"ῲ": "ῳ",
"ῴ": "ῳ",
"ῷ": "ῳ",
"ᾨ": "ῼ",
"ᾩ": "ῼ",
"ᾪ": "ῼ",
"ᾫ": "ῼ",
"ᾬ": "ῼ",
"ᾭ": "ῼ",
"ᾮ": "ῼ",
"ᾯ": "ῼ",
"ῤ": "ρ",
"ῥ": "ρ",
"Ῥ": "Ρ",
"Ὼ": "Ω",
"Ώ": "Ω",
"ꭥ": "Ω",
        }
        for source, target in replacements.items():
            text = text.replace(source, target)
    else:
        text = unidecode.unidecode(text)
    return text
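
# Unlike delete_accents, this also strips breathings and circumflexes for
# Ancient Greek (iota subscripts are kept), e.g. (illustrative):
#
#     >>> delete_diacritics("μῆνιν ἄειδε", lang=Literal("grc", datatype=XSD.string))
#     'μηνιν αειδε'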

def load_dtd(dtd_url: str) -> etree.DTD:
    """Utility function to load the dtd
    :param dtd_url: the location of the dtd file
    Returns:
        etree.DTD: the dtd object to be used for validation
    """
    dtd = None
    with open(dtd_url) as r:
        dtd_file_object = StringIO(r.read())
        dtd = etree.DTD(dtd_file_object)
    if dtd is None:
        logging.error("failed to load dtd from " + str(dtd_url))
    else:
        logging.info("Successfully loaded dtd from " + str(dtd_url))
    return dtd
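
# Illustrative usage (paths are hypothetical):
#
#     dtd = load_dtd("dtd/nif.dtd")
#     if dtd is not None:
#         tree = etree.parse("document.xml")
#         print(dtd.validate(tree))  # True if the document satisfies the dtd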