Source code for nifigator.converters

# coding: utf-8

"""converters"""

import logging
from rdflib.namespace import DC, DCTERMS, XSD
from rdflib.term import Literal, URIRef

from .const import OLIA, mapobject
from .nafdocument import NafDocument
from .nifobjects import (
    NifContext,
    NifContextCollection,
    NifPage,
    NifParagraph,
    NifPhrase,
    NifSentence,
    NifWord,
)


def nafConverter(
    collection_name: str = None,
    context_name: str = None,
    nafdocument: NafDocument = None,
    base_uri: str = None,
    base_prefix: str = None,
    URIScheme: str = None,
):
    collection_uri = base_uri + collection_name
    context_uri = base_uri + context_name

    # create nif:collection
    nif_collection = NifContextCollection(uri=collection_uri)

    # create nif:context from the raw document text
    doc_raw = nafdocument.raw if nafdocument.raw is not None else ""
    nif_context = NifContext(
        isString=doc_raw,
        uri=URIRef(context_uri),
        URIScheme=URIScheme,
    )
    nif_context.set_referenceContext(nif_context)
    nif_collection.add_context(nif_context)

    # set metadata
    metadata = nafdocument.header["public"]
    metadata = {
        URIRef(key.replace("{", "").replace("}", "")): Literal(
            metadata[key], datatype=XSD.string
        )
        for key in metadata.keys()
    }
    metadata[DC.language] = Literal(nafdocument.language, datatype=XSD.string)
    metadata[DCTERMS.created] = Literal(
        nafdocument.header["fileDesc"]["creationtime"], datatype=XSD.string
    )
    metadata[DCTERMS.provenance] = Literal(
        nafdocument.header["fileDesc"]["filename"], datatype=XSD.string
    )
    # correction for NAF files: dc:uri is incorrect and should be dcterms:URI
    dc_uri = URIRef("http://purl.org/dc/elements/1.1/uri")
    if dc_uri in metadata:
        metadata[DCTERMS.URI] = metadata[dc_uri]
        del metadata[dc_uri]
    metadata[DCTERMS.identifier] = Literal(nif_context.uri, datatype=XSD.string)
    nif_context.set_metadata(metadata)

    # create nif:sentence and nif:word
    doc_words = {word["id"]: word for word in nafdocument.text}
    doc_terms = {term["id"]: term for term in nafdocument.terms}
    doc_sentences = nafdocument.sentences
    for sent_idx, sentence in enumerate(doc_sentences):
        beginIndex = int(doc_words[sentence["span"][0]["id"]]["offset"])
        endIndex = int(doc_words[sentence["span"][-1]["id"]]["offset"]) + int(
            doc_words[sentence["span"][-1]["id"]]["length"]
        )
        nif_sentence = NifSentence(
            uri=base_uri + context_name,
            beginIndex=beginIndex,
            endIndex=endIndex,
            referenceContext=nif_context,
            URIScheme=URIScheme,
            # annotation reference missing
        )
        sentence["nif"] = nif_sentence

    nif_words = dict()
    for sent_idx, sentence in enumerate(doc_sentences):
        nif_context.add_sentence(sentence["nif"])
        # add nextSentence and previousSentence to make the graph traversable
        if sent_idx < len(doc_sentences) - 1:
            sentence["nif"].set_nextSentence(doc_sentences[sent_idx + 1]["nif"])
        if sent_idx > 0:
            sentence["nif"].set_previousSentence(doc_sentences[sent_idx - 1]["nif"])

        for word_idx, word_id in enumerate(sentence["span"]):
            word = doc_words[word_id["id"]]
            beginIndex = int(word["offset"])
            endIndex = int(word["offset"]) + int(word["length"])
            nif_word = NifWord(
                beginIndex=beginIndex,
                endIndex=endIndex,
                referenceContext=nif_context,
                nifsentence=sentence["nif"],
                # annotation reference missing
                uri=base_uri + context_name,
                URIScheme=URIScheme,
            )
            word["nif"] = nif_word
            nif_words[nif_word.uri] = nif_word

        # add nextWord and previousWord
        for word_idx, word_id in enumerate(sentence["span"]):
            word = doc_words[word_id["id"]]
            sentence["nif"].add_word(word["nif"])
            if word_idx < len(sentence["span"]) - 1:
                word["nif"].set_nextWord(
                    doc_words[sentence["span"][word_idx + 1]["id"]]["nif"]
                )
            if word_idx > 0:
                word["nif"].set_previousWord(
                    doc_words[sentence["span"][word_idx - 1]["id"]]["nif"]
                )

        for term in sentence["terms"]:
            term_words = [s["id"] for s in doc_terms[term["id"]]["span"]]
            beginIndex = int(doc_words[term_words[0]]["offset"])
            endIndex = int(doc_words[term_words[-1]]["offset"]) + int(
                doc_words[term_words[-1]]["length"]
            )
            term_lemma = doc_terms[term["id"]].get("lemma", None)
            # pos may be absent in the term layer
            term_pos = doc_terms[term["id"]].get("pos", None)
            if term_pos is not None:
                term_pos = mapobject("pos", term_pos.lower()).replace("olia:", "")
                term_pos = [OLIA[term_pos]]
            else:
                term_pos = []
            term_morphofeats = []
            morphofeats = doc_terms[term["id"]].get("morphofeat", None)
            if morphofeats is not None:
                for feat in morphofeats.split("|"):
                    feat_name, feat_value = feat.split("=")
                    # boolean features map directly to OLIA classes
                    if (
                        feat_name in ["Foreign", "Reflex", "Poss", "Abbr"]
                        and feat_value == "Yes"
                    ):
                        olia_term = (
                            feat_name.replace("Poss", "PossessivePronoun")
                            .replace("Abbr", "Abbreviation")
                            .replace("Reflex", "ReflexivePronoun")
                        )
                        term_morphofeats.append(olia_term)
                    else:
                        term_morphofeats.append(
                            mapobject(feat_name, feat_value).replace("olia:", "")
                        )
            term_morphofeats = [OLIA[m] for m in term_morphofeats]
            nif_term = NifWord(
                beginIndex=beginIndex,
                endIndex=endIndex,
                referenceContext=nif_context,
                lemma=term_lemma,
                pos=term_pos,
                morphofeats=term_morphofeats,
                # annotation reference missing
                uri=base_uri + context_name,
                URIScheme=URIScheme,
            )
            doc_terms[term["id"]]["nif"] = nif_term
            if nif_term.uri not in nif_words.keys():
                nif_words[nif_term.uri] = nif_term
            else:
                nif_words[nif_term.uri].set_lemma(term_lemma)
                nif_words[nif_term.uri].set_pos(term_pos)
                nif_words[nif_term.uri].set_morphofeats(term_morphofeats)

    # create nif:page
    nif_pages = []
    if len(nafdocument.text) > 0:
        page_number = int(nafdocument.text[0]["page"])
        page_start = int(nafdocument.text[0]["offset"])
        page_end = int(nafdocument.text[0]["offset"])
    else:
        page_number = 1
        page_start = 0
        page_end = 0
    beginIndex = page_start
    endIndex = page_end
    for word in nafdocument.text:
        if int(word["page"]) != page_number:
            # the word starts a new page: close the current one
            nif_page = NifPage(
                beginIndex=beginIndex,
                endIndex=endIndex,
                referenceContext=nif_context,
                uri=base_uri + context_name,
                URIScheme=URIScheme,
            )
            nif_pages.append(nif_page)
            beginIndex = int(word["offset"])
            page_number = int(word["page"])
        endIndex = int(word["offset"]) + int(word["length"])
    nif_page = NifPage(
        beginIndex=beginIndex,
        endIndex=endIndex,
        referenceContext=nif_context,
        uri=base_uri + context_name,
        URIScheme=URIScheme,
    )
    nif_pages.append(nif_page)
    nif_context.set_Pages(nif_pages)

    # create nif:phrases from the entity layer
    nif_phrases = []
    for entity in nafdocument.entities:
        taClassRef = "https://stanfordnlp.github.io/stanza#" + entity.get(
            "type", "unknown"
        )
        entity_words = [
            ss["id"] for s in entity["span"] for ss in doc_terms[s["id"]]["span"]
        ]
        beginIndex = int(doc_words[entity_words[0]]["offset"])
        endIndex = int(doc_words[entity_words[-1]]["offset"]) + int(
            doc_words[entity_words[-1]]["length"]
        )
        nif_phrase = NifPhrase(
            beginIndex=beginIndex,
            endIndex=endIndex,
            referenceContext=nif_context,
            taClassRef=URIRef(taClassRef),
            entityOccurrence=True,
            uri=base_uri + context_name,
            URIScheme=URIScheme,
        )
        nif_phrases.append(nif_phrase)
    nif_context.set_Phrases(nif_phrases)

    # add dependencies
    for dep in nafdocument.deps:
        from_term = doc_terms[dep["from_term"]]
        to_term = doc_terms[dep["to_term"]]
        rfunc = dep["rfunc"]
        if "nif" in from_term.keys() and "nif" in to_term.keys():
            from_term["nif"].add_dependency(to_term["nif"])
            from_term["nif"].set_dependencyRelationType(rfunc)
        else:
            if "nif" not in from_term.keys():
                logging.warning(
                    ".. from term in dependency not found:\n" + str(from_term)
                )
            if "nif" not in to_term.keys():
                logging.warning(
                    ".. to term in dependency not found:\n" + str(to_term)
                )

    # create nif:paragraph
    doc_paragraphs = nafdocument.paragraphs
    for para_idx, paragraph in enumerate(doc_paragraphs):
        if paragraph["span"] != []:
            beginIndex = int(doc_words[paragraph["span"][0]["id"]]["offset"])
            endIndex = int(doc_words[paragraph["span"][-1]["id"]]["offset"]) + int(
                doc_words[paragraph["span"][-1]["id"]]["length"]
            )
            nif_paragraph = NifParagraph(
                beginIndex=beginIndex,
                endIndex=endIndex,
                referenceContext=nif_context,
                # annotation reference missing
                uri=base_uri + context_name,
                URIScheme=URIScheme,
            )
            paragraph["nif"] = nif_paragraph
    # paragraphs with an empty span never receive a "nif" entry, so skip them
    nif_context.set_Paragraphs(
        paragraph["nif"] for paragraph in doc_paragraphs if "nif" in paragraph
    )

    return nif_collection
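A minimal usage sketch of the converter above, assuming a NAF file produced beforehand and a nafigator-style NafDocument.open() loader; the file name, URIs, and the collection uri attribute are placeholders and assumptions, not part of the module:

from nifigator import NafDocument, nafConverter  # import path assumed

# load a NAF document; open() is assumed to follow the nafigator-style API
naf = NafDocument().open("example.naf")

# convert the NAF layers (text, terms, entities, deps, paragraphs)
# into a NifContextCollection holding a single NifContext
collection = nafConverter(
    collection_name="collection_example",
    context_name="context_example",
    nafdocument=naf,
    base_uri="https://example.org/",
)

# attribute name inferred from the uri= constructor argument above
print(collection.uri)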