# -*- coding: utf-8 -*-
import datetime
import logging
import re
import uuid
from io import StringIO
import unidecode
from lxml import etree
from rdflib.namespace import XSD
from rdflib.term import Literal
import syntok.segmenter as segmenter

def to_iri(s: str = ""):
    """
    Function to convert a string to a valid IRI part by replacing
    or removing problematic characters
    :param s: the string to be converted
    """
    return (
        s.replace('"', "%22")
        .replace("µ", "mu")
        .replace("ª", "_")
        .replace("º", "_")
        .replace("'", "%27")
        .replace(">", "")
        .replace("<", "")
    )
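
# A minimal usage sketch (illustrative):
#
#     >>> to_iri('"µ"')
#     '%22mu%22'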

def tokenize_text(text: str = None, forced_sentence_split_characters: list = []):
    """
    Function to tokenize a text into sentences of tokens, starting a new
    sentence whenever a forced sentence split character is encountered
    :param text: the text to be tokenized
    :param forced_sentence_split_characters: tokens that force a sentence split
    """
    tokenized_text = tokenizer(text)
    tokenized_new = []
    for sentence in tokenized_text:
        tok_sent = []
        for token in sentence:
            # flush the current sentence when a forced split character is found
            if token["text"] in forced_sentence_split_characters:
                if tok_sent != []:
                    tokenized_new.append(tok_sent)
                    tok_sent = []
            tok_sent.append(token)
        tokenized_new.append(tok_sent)
    tokenized_text = tokenized_new
    # delete empty tokens at the end of each sentence
    if tokenized_text != []:
        tokenized_text = [
            sentence if sentence[-1]["text"] != "" else sentence[:-1]
            for sentence in tokenized_text
        ]
    return tokenized_text
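
# Illustrative sketch: with "|" as a forced split character, the tokens of
# "a b | c" end up in two sentences, the second starting at the "|" token
# (assuming syntok tokenizes "|" as a separate token; token dicts are
# abbreviated here to their "text" values):
#
#     [[t["text"] for t in s] for s in tokenize_text("a b | c", ["|"])]
#     # -> [['a', 'b'], ['|', 'c']]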

def replace_escape_characters(text: str = None):
    """
    Function to replace escape characters by spaces (maintaining exact character locations)
    :param text: the text where escape characters should be replaced
    """
    escape_characters = [
        "\a",  # bell
        "\b",  # back space
        "\t",  # tab
        "\n",  # new line
        "\v",  # vertical tab
        "\f",  # form feed
        "\r",  # carriage return
    ]
    escape_character_table = {
        ord(escape_character): " " for escape_character in escape_characters
    }
    return text.translate(escape_character_table)
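
# A minimal check (illustrative): every escape character becomes a single
# space, so character offsets into the text are preserved:
#
#     >>> replace_escape_characters("a\tb\nc")
#     'a b c'
#     >>> len(replace_escape_characters("a\tb\nc")) == len("a\tb\nc")
#     True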

def tokenizer(text: str = None):
    """
    Function to create a list of sentences, each a list of words with the
    text, start_char and end_char of each word
    :param text: the text to be tokenized
    """
    sentences = list()
    for paragraph in segmenter.analyze(text):
        for sentence in paragraph:
            words = list()
            for token in sentence:
                value = text[token.offset : token.offset + len(token.value)]
                if value != token.value:
                    logging.error("Error: incorrect offsets in syntok.segmenter.")
                else:
                    words.append(
                        {
                            "text": token.value,
                            "start_char": token.offset,
                            "end_char": token.offset + len(token.value),
                        }
                    )
            sentences.append(words)
    return sentences
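
# Example of the expected output shape (exact tokens and offsets depend on
# syntok's segmentation):
#
#     tokenizer("Hello world. Bye.")
#     # -> [[{'text': 'Hello', 'start_char': 0, 'end_char': 5},
#     #      {'text': 'world', 'start_char': 6, 'end_char': 11},
#     #      {'text': '.', 'start_char': 11, 'end_char': 12}],
#     #     [{'text': 'Bye', 'start_char': 13, 'end_char': 16},
#     #      {'text': '.', 'start_char': 16, 'end_char': 17}]]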

def align_stanza_dict_offsets(stanza_dict: list = None, sentences: list = None):
    """
    Function to align the stanza dict offsets with the offsets from the tokenizer
    :param stanza_dict: the output dict from the Stanza pipeline
    :param sentences: the output of the tokenizer
    """
    # check alignment of stanza_dict and the tokenized sentences
    assert len(stanza_dict) == len(sentences)
    for sent_idx, sent in enumerate(stanza_dict):
        assert len(stanza_dict[sent_idx]) == len(sentences[sent_idx])
    # correct stanza_dict start_char and end_char
    for sent_idx, sent in enumerate(stanza_dict):
        for word_idx, word in enumerate(sent):
            word["start_char"] = sentences[sent_idx][word_idx]["start_char"]
            word["end_char"] = sentences[sent_idx][word_idx]["end_char"]
    return stanza_dict
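
# Minimal sketch with hand-built lists (in practice stanza_dict comes from
# a Stanza pipeline document converted with to_dict()); the tokenizer
# offsets overwrite the ones computed by Stanza:
#
#     stanza_sents = [[{"text": "Hi", "start_char": 9, "end_char": 11}]]
#     tok_sents = [[{"text": "Hi", "start_char": 0, "end_char": 2}]]
#     align_stanza_dict_offsets(stanza_sents, tok_sents)
#     # -> [[{'text': 'Hi', 'start_char': 0, 'end_char': 2}]]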

def generate_uuid(uri: str = None, prefix: str = "nif-"):
    """
    Function to generate the uuid for nif
    :param uri: the uri from which the uuid should be generated
    :param prefix: the prefix of the uuid, default = "nif-"
    """
    return prefix + uuid.uuid3(uuid.NAMESPACE_DNS, uri).hex
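
# uuid3 is deterministic, so the same uri always yields the same identifier
# (illustrative; the digest is stable across runs and machines):
#
#     generate_uuid("https://example.org/doc")
#     # -> 'nif-' followed by 32 hex characters, identical on every call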

def natural_sort(elements: list = None):
    """
    Function to sort a list of strings with numbers
    :param elements: the list to be sorted
    """

    def convert_to_int(text):
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        return [convert_to_int(c) for c in re.split("([0-9]+)", key)]

    return sorted(elements, key=alphanum_key)
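
# A minimal example (illustrative): numeric runs compare as integers and
# the comparison is case-insensitive:
#
#     >>> natural_sort(["page10", "page2", "Page1"])
#     ['Page1', 'page2', 'page10']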

def delete_accents(text: str = None, lang: str = "en"):
    """
    Function to delete accents from a string
    :param text: the string from which the accents should be deleted
    :param lang: the language of the text in the string
    """
    if lang == Literal("grc", datatype=XSD.string):
        replacements = {
"ἒ": "ἐ",
"ἓ": "ἑ",
"ἔ": "ἐ",
"ἕ": "ἑ",
"έ": "ε",
"ὲ": "ε",
"έ": "ε",
"ἂ": "ἀ",
"ἃ": "ἁ",
"ἄ": "ἀ",
"ἅ": "ἁ",
"ά": "α",
"ὰ": "α",
"ά": "α",
"ᾂ": "ᾀ",
"ᾄ": "ᾀ",
"ᾃ": "ᾁ",
"ᾅ": "ᾁ",
"ᾲ": "ᾳ",
"ᾴ": "ᾳ",
"ί": "ι",
"ἲ": "ἰ",
"ἳ": "ἱ",
"ἴ": "ἰ",
"ἵ": "ἱ",
"ῒ": "ϊ",
"ΐ": "ϊ",
"ὶ": "ι",
"ί": "ι",
"ή": "η",
"ἢ": "ἠ",
"ἣ": "ἡ",
"ἤ": "ἠ",
"ἥ": "ἡ",
"ὴ": "η",
"ή": "η",
"ΰ": "ϋ",
"ύ": "υ",
"ὒ": "ὐ",
"ὓ": "ὑ",
"ὔ": "ὐ",
"ὕ": "ὑ",
"ὺ": "υ",
"ύ": "υ",
"ῢ": "ϋ",
"ΰ": "ϋ",
"ὢ": "ὠ",
"ὣ": "ὡ",
"ὤ": "ὠ",
"ὥ": "ὡ",
"ὼ": "ω",
"ώ": "ω",
"ό": "ο",
"ὂ": "ὀ",
"ὃ": "ὁ",
"ὄ": "ὀ",
"ὅ": "ὁ",
"ὸ": "ο",
"ό": "ο",
"ᾢ": "ᾠ",
"ᾣ": "ᾡ",
"ᾤ": "ᾠ",
"ᾥ": "ᾡ",
"ῲ": "ῳ",
"ῴ": "ῳ",
        }
        for source, target in replacements.items():
            text = text.replace(source, target)
    else:
        text = unidecode.unidecode(text)
    return text
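
# Illustrative examples: for Ancient Greek only the acute and grave accents
# are removed (breathings, circumflexes and iota subscripts are kept); any
# other language falls back to unidecode transliteration:
#
#     >>> delete_accents("café")
#     'cafe'
#     >>> delete_accents("ἄειδε", lang=Literal("grc", datatype=XSD.string))
#     'ἀειδε'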

def delete_diacritics(text: str = None, lang: str = "en"):
    """
    Function to delete diacritics from a string
    :param text: the string from which the diacritics should be deleted
    :param lang: the language of the text in the string
    """
    if lang == Literal("grc", datatype=XSD.string):
        replacements = {
"Ά": "Α",
"Ᾰ": "Α",
"Ᾱ": "Α",
"Ὰ": "Α",
"Ά": "Α",
"Έ": "Ε",
"Ὲ": "Ε",
"Έ": "Ε",
"Ή": "Η",
"Ὴ": "Η",
"Ή": "Η",
"Ί": "Ι",
"Ϊ": "Ι",
"Ό": "Ο",
"Ὸ": "Ο",
"Ό": "Ο",
"Ύ": "Υ",
"Ϋ": "Υ",
"Ώ": "Ω",
"ϓ": "ϒ",
"ϔ": "ϒ",
"Ὑ": "ϒ",
"Ὓ": "ϒ",
"Ὕ": "ϒ",
"Ὗ": "ϒ",
"Ῠ": "ϒ",
"Ῡ": "ϒ",
"Ὺ": "ϒ",
"Ύ": "ϒ",
"ἀ": "α",
"ἁ": "α",
"ἂ": "α",
"ἃ": "α",
"ἄ": "α",
"ἅ": "α",
"ἆ": "α",
"ἇ": "α",
"ά": "α",
"ὰ": "α",
"ά": "α",
"ᾰ": "α",
"ᾱ": "α",
"ᾶ": "α",
"Ἀ": "Α",
"Ἁ": "Α",
"Ἂ": "Α",
"Ἃ": "Α",
"Ἄ": "Α",
"Ἅ": "Α",
"Ἆ": "Α",
"Ἇ": "Α",
"ἐ": "ε",
"ἑ": "ε",
"ἒ": "ε",
"ἓ": "ε",
"ἔ": "ε",
"ἕ": "ε",
"έ": "ε",
"ὲ": "ε",
"έ": "ε",
"Ἐ": "Ε",
"Ἑ": "Ε",
"Ἒ": "Ε",
"Ἓ": "Ε",
"Ἔ": "Ε",
"Ἕ": "Ε",
"ἠ": "η",
"ἡ": "η",
"ἢ": "η",
"ἣ": "η",
"ἤ": "η",
"ἥ": "η",
"ἦ": "η",
"ἧ": "η",
"ή": "η",
"ὴ": "η",
"ή": "η",
"ῆ": "η",
"Ἠ": "Η",
"Ἡ": "Η",
"Ἢ": "Η",
"Ἣ": "Η",
"Ἤ": "Η",
"Ἥ": "Η",
"Ἦ": "Η",
"Ἧ": "Η",
"ἰ": "ι",
"ἱ": "ι",
"ἲ": "ι",
"ἳ": "ι",
"ἴ": "ι",
"ἵ": "ι",
"ἶ": "ι",
"ἷ": "ι",
"ΐ": "ι",
"ϊ": "ι",
"ί": "ι",
"ὶ": "ι",
"ί": "ι",
"ῐ": "ι",
"ῑ": "ι",
"ῒ": "ι",
"ΐ": "ι",
"ῖ": "ι",
"ῗ": "ι",
"ΰ": "υ",
"ϋ": "υ",
"ύ": "υ",
"ὐ": "υ",
"ὑ": "υ",
"ὒ": "υ",
"ὓ": "υ",
"ὔ": "υ",
"ὕ": "υ",
"ὖ": "υ",
"ὗ": "υ",
"ὺ": "υ",
"ύ": "υ",
"ῠ": "υ",
"ῡ": "υ",
"ῢ": "υ",
"ΰ": "υ",
"ῦ": "υ",
"ῧ": "υ",
"ό": "ο",
"ὀ": "ο",
"ὁ": "ο",
"ὂ": "ο",
"ὃ": "ο",
"ὄ": "ο",
"ὅ": "ο",
"ὸ": "ο",
"ό": "ο",
"ώ": "ω",
"ὠ": "ω",
"ὡ": "ω",
"ὢ": "ω",
"ὣ": "ω",
"ὤ": "ω",
"ὥ": "ω",
"ὦ": "ω",
"ὧ": "ω",
"ὼ": "ω",
"ώ": "ω",
"ῶ": "ω",
"Ἰ": "Ι",
"Ἱ": "Ι",
"Ἲ": "Ι",
"Ἳ": "Ι",
"Ἴ": "Ι",
"Ἵ": "Ι",
"Ἶ": "Ι",
"Ἷ": "Ι",
"Ῐ": "Ι",
"Ῑ": "Ι",
"Ὶ": "Ι",
"Ί": "Ι",
"Ὀ": "Ο",
"Ὁ": "Ο",
"Ὂ": "Ο",
"Ὃ": "Ο",
"Ὄ": "Ο",
"Ὅ": "Ο",
"Ὠ": "Ω",
"Ὡ": "Ω",
"Ὢ": "Ω",
"Ὣ": "Ω",
"Ὤ": "Ω",
"Ὥ": "Ω",
"Ὦ": "Ω",
"Ὧ": "Ω",
"ᾀ": "ᾳ",
"ᾁ": "ᾳ",
"ᾂ": "ᾳ",
"ᾃ": "ᾳ",
"ᾄ": "ᾳ",
"ᾅ": "ᾳ",
"ᾆ": "ᾳ",
"ᾇ": "ᾳ",
"ᾲ": "ᾳ",
"ᾴ": "ᾳ",
"ᾷ": "ᾳ",
"ᾈ": "ᾼ",
"ᾉ": "ᾼ",
"ᾊ": "ᾼ",
"ᾋ": "ᾼ",
"ᾌ": "ᾼ",
"ᾍ": "ᾼ",
"ᾎ": "ᾼ",
"ᾏ": "ᾼ",
"ᾐ": "ῃ",
"ᾑ": "ῃ",
"ᾒ": "ῃ",
"ᾓ": "ῃ",
"ᾔ": "ῃ",
"ᾕ": "ῃ",
"ᾖ": "ῃ",
"ᾗ": "ῃ",
"ῂ": "ῃ",
"ῄ": "ῃ",
"ῇ": "ῃ",
"ᾘ": "ῌ",
"ᾙ": "ῌ",
"ᾚ": "ῌ",
"ᾛ": "ῌ",
"ᾜ": "ῌ",
"ᾝ": "ῌ",
"ᾞ": "ῌ",
"ᾟ": "ῌ",
"ᾠ": "ῳ",
"ᾡ": "ῳ",
"ᾢ": "ῳ",
"ᾣ": "ῳ",
"ᾤ": "ῳ",
"ᾥ": "ῳ",
"ᾦ": "ῳ",
"ᾧ": "ῳ",
"ῲ": "ῳ",
"ῴ": "ῳ",
"ῷ": "ῳ",
"ᾨ": "ῼ",
"ᾩ": "ῼ",
"ᾪ": "ῼ",
"ᾫ": "ῼ",
"ᾬ": "ῼ",
"ᾭ": "ῼ",
"ᾮ": "ῼ",
"ᾯ": "ῼ",
"ῤ": "ρ",
"ῥ": "ρ",
"Ῥ": "Ρ",
"Ὼ": "Ω",
"Ώ": "Ω",
"ꭥ": "Ω",
        }
        for source, target in replacements.items():
            text = text.replace(source, target)
    else:
        text = unidecode.unidecode(text)
    return text
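
# Unlike delete_accents, this also strips breathings and circumflexes for
# Ancient Greek (iota subscripts are kept), e.g. (illustrative):
#
#     >>> delete_diacritics("μῆνιν ἄειδε", lang=Literal("grc", datatype=XSD.string))
#     'μηνιν αειδε'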

def load_dtd(dtd_url: str) -> etree.DTD:
    """Utility function to load the dtd
    :param dtd_url: the location of the dtd file
    Returns:
        etree.DTD: the dtd object to be used for validation
    """
    dtd = None
    with open(dtd_url) as r:
        dtd_file_object = StringIO(r.read())
        dtd = etree.DTD(dtd_file_object)
    if dtd is None:
        logging.error("failed to load dtd from " + str(dtd_url))
    else:
        logging.info("Successfully loaded dtd from " + str(dtd_url))
    return dtd
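
# Illustrative usage (paths are hypothetical):
#
#     dtd = load_dtd("dtd/nif.dtd")
#     if dtd is not None:
#         tree = etree.parse("document.xml")
#         print(dtd.validate(tree))  # True if the document satisfies the dtd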