Source code for nifigator.nifobjects

# -*- coding: utf-8 -*-

import logging
from collections import OrderedDict, defaultdict, deque
from typing import Union, List

import iribaker
from rdflib import Graph
from rdflib.namespace import DC, DCTERMS, RDF, XSD
from rdflib.term import IdentifiedNode, Literal, URIRef

from .const import (
    ITSRDF,
    NIF,
    NIF_ONTOLOGY,
    OLIA,
    EntityOccurrence,
    OffsetBasedString,
    RFC5147String,
    TermOccurrence,
    mapobject,
    upos2olia,
)
from .utils import tokenize_text, delete_accents, delete_diacritics, natural_sort


class NifContext:
    pass


class NifSentence:
    pass


class NifParagraph:
    pass


class NifPage:
    pass


class NifPhrase:
    pass


class NifWord:
    pass


[docs]class NifBase(object): """ A NIF Base :param uri: the uri of the object """ def __init__(self, uri: Union[URIRef, str] = None): self.set_uri(uri) def __eq__(self, other): return self._uri == other._uri @property def uri(self): """ Returns the uri of the object """ if self._uri is not None: return self._uri else: return None
[docs] def set_uri(self, uri: Union[URIRef, str] = None): """ Sets the uri of the object. If the uri is a string then it is converted to an iri. """ if isinstance(uri, str): self._uri = URIRef(iribaker.to_iri(uri)) else: self._uri = uri
[docs]class NifString(NifBase): """ A NIF String :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param referenceContext: the context to which the string refers """ def __init__( self, URIScheme: str = None, base_uri: URIRef = None, uri: URIRef = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, referenceContext: NifContext = None, graph: Graph = None, ): self.set_graph(graph) self.set_URIScheme(URIScheme) self.set_beginIndex(beginIndex) self.set_endIndex(endIndex) self.set_base_uri(base_uri) self.set_uri(uri) self.set_referenceContext(referenceContext) def __eq__(self, other): return ( (self._URIScheme == other._URIScheme) & (self._beginIndex == other._beginIndex) & (self._endIndex == other._endIndex) & (self._referenceContext.uri == other._referenceContext.uri) & super(NifBase, self).__eq__(other) ) @property def beginIndex(self): """ Returns the start index of the context string as an `int`. """ if self._beginIndex is not None: return int(self._beginIndex) elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.beginIndex): return int(item) else: return None @property def endIndex(self): """ Returns the end index of the context string as an `int`. """ if self._endIndex is not None: return int(self._endIndex) elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.endIndex): return int(item) else: return None @property def referenceContext(self): """ Returns the context of the current object """ if self._referenceContext is not None: return self._referenceContext else: return None @property def anchorOf(self): """ Returns the string of the object as a `str`. The anchorOf is not store in the object but extracted from the referenceContext. """ return self.referenceContext.isString[self.beginIndex : self.endIndex] @property def anchorOf_no_accents(self): """ Returns the string without accents of the object as a `str`. """ anchorOf = self.referenceContext.isString[self.beginIndex : self.endIndex] if self._referenceContext.metadata is not None: lang = self._referenceContext._metadata.get( DC.language, Literal("en", datatype=XSD.string) ) else: lang = Literal("en", datatype=XSD.string) return delete_accents(anchorOf, lang=lang) @property def anchorOf_no_diacritics(self): """ Returns the string without diacritics of the object as a `str`. """ anchorOf = self.referenceContext.isString[self.beginIndex : self.endIndex] if self._referenceContext.metadata is not None: lang = self._referenceContext._metadata.get( DC.language, Literal("en", datatype=XSD.string) ) else: lang = Literal("en", datatype=XSD.string) return delete_diacritics(anchorOf, lang=lang) @property def URIScheme(self): """ Returns the URIScheme """ if self._URIScheme is not None: return self._URIScheme elif self.graph is not None: for o in self.graph.objects(subject=self.uri, predicate=RDF.type): if o == NIF.OffsetBasedString or o == NIF.RFC5147String: self._URIScheme = o return self._URIScheme else: return None
[docs] def set_base_uri(self, base_uri: URIRef = None): """ Sets the base uri of the object """ self._base_uri = base_uri
[docs] def set_uri(self, uri: URIRef = None): """ Sets the uri of the object """ if uri is None: if self._base_uri is not None: base_uri = self._base_uri.replace("&nif=context", "") if isinstance(self, NifContext): uri = base_uri + "&nif=context" elif isinstance(self, NifContextCollection): uri = base_uri + "&nif=collection" elif isinstance(self, NifPage): uri = base_uri + "&nif=page" elif isinstance(self, NifParagraph): uri = base_uri + "&nif=paragraph" elif isinstance(self, NifSentence): uri = base_uri + "&nif=sentence" elif isinstance(self, NifPhrase): uri = base_uri + "&nif=phrase" elif isinstance(self, NifWord): uri = base_uri + "&nif=word" if not isinstance(self, NifContext): if self.URIScheme == RFC5147String: uri = ( uri + "#char=" + str(self.beginIndex) + "," + str(self.endIndex) ) else: # default is OffsetBasedString: uri = ( uri + "_" + str(self.beginIndex) + "_" + str(self.endIndex) ) super().set_uri(uri=uri)
[docs] def set_beginIndex(self, beginIndex: Union[Literal, int] = None): """ Sets the start of the index of the string. The type of beginIndex can be a `Literal` or an `int`. If the type is an `int` then it is converted to a Literal. """ if isinstance(beginIndex, int) or isinstance(beginIndex, str): self._beginIndex = Literal(beginIndex, datatype=XSD.nonNegativeInteger) else: self._beginIndex = beginIndex
[docs] def set_endIndex(self, endIndex: Union[Literal, int] = None): """ Sets the end of the index of the string. The type of endIndex can be a `Literal` or an `int`. If the type is an `int` then it is converted to a Literal. """ if isinstance(endIndex, int) or isinstance(endIndex, str): self._endIndex = Literal(endIndex, datatype=XSD.nonNegativeInteger) else: self._endIndex = endIndex
[docs] def set_referenceContext(self, referenceContext: NifContext = None): """ Sets the referenceContext of the object. """ if referenceContext is not None: self._referenceContext = referenceContext
[docs] def set_anchorOf(self, anchorOf: Union[str, Literal] = None): """ The anchorOf should be consistent with the string in the referenceContext, otherwise an error is logged. """ if self.anchorOf is not None: if str(anchorOf) != self.anchorOf: logging.error( "Inconsistency in anchorOf string and (part in) referenceContext string: " + str(uri) )
# if isinstance(anchorOf, str): # self._anchorOf = Literal(anchorOf, datatype=XSD.string) # elif isinstance(anchorOf, Literal): # self._anchorOf = anchorOf
[docs] def set_URIScheme(self, URIScheme: str = None): """ Sets the URIScheme of the object """ self._URIScheme = URIScheme
[docs] def set_graph(self, graph: Graph = None): self.graph = graph
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: if self._URIScheme == OffsetBasedString: yield (self.uri, RDF.type, NIF.OffsetBasedString) elif self._URIScheme == RFC5147String: yield (self.uri, RDF.type, NIF.RFC5147String) yield (self.uri, RDF.type, NIF.String) if self._beginIndex is not None: yield (self.uri, NIF.beginIndex, self._beginIndex) if self._endIndex is not None: yield (self.uri, NIF.endIndex, self._endIndex) if self._referenceContext is not None: yield (self.uri, NIF.referenceContext, self._referenceContext.uri)
[docs]class NifContext(NifString): """ A NIF Context :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param sourceUrl: the source url of the context :param predLang: the predominant language of the context :param isString: the string of the context :param metadata: a list of URIRefs with metadata """ def __init__( self, URIScheme: str = None, base_uri: URIRef = None, uri: URIRef = None, sourceUrl: URIRef = None, predLang: URIRef = None, isString: Union[Literal, str] = None, metadata: dict = None, lexicon: URIRef = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=0 if isString is not None else None, endIndex=len(isString) if isString is not None else None, referenceContext=self, graph=graph, ) self.set_Sentences(None) self.set_Paragraphs(None) self.set_Pages(None) self.set_Phrases(None) self.set_sourceUrl(sourceUrl) self.set_predLang(predLang) self.set_isString(isString) self.set_metadata(metadata) self.set_lexicon(lexicon) def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:Context) uri = {self.uri.n3()}\n" if self.sourceUrl is not None: s += f" sourceUrl : {self.sourceUrl.n3()}\n" if self.predLang is not None: s += f" predLang : {self.predLang.n3()}\n" if self.isString is not None: if len(self.isString) > 1000: s += f' isString : {repr(self.isString[0:1000]+"... ")}\n' else: s += f" isString : {repr(self.isString)}\n" if self.firstSentence is not None: s += f" firstSentence : {repr(self.firstSentence.anchorOf)}\n" if self.lastSentence is not None: s += f" lastSentence : {repr(self.lastSentence.anchorOf)}\n" if self._metadata is not None and self._metadata != {}: for d in self._metadata.keys(): s += f" {d} : {self._metadata[d]}\n" return s def __eq__(self, other): return ( (self._URIScheme == other._URIScheme) & (self._uri == other._uri) & (self.sourceUrl == other.sourceUrl) & (self.predLang == other.predLang) & (self.isString == other.isString) & (self.metadata == other.metadata) & super(NifBase, self).__eq__(other) ) @property def metadata(self): """ Returns the metadata of the context """ if self._metadata is not None: return self._metadata elif self.graph is not None: metadata = {} for p, o in self.graph.predicate_objects(subject=self.uri): if p in DC or p in DCTERMS: metadata[p] = o return metadata else: return None @property def sourceUrl(self): """ Returns the sourceUrl of the context """ if self._sourceUrl is not None: return self._sourceUrl elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.sourceUrl): return item else: return None @property def predLang(self): """ Returns the predLang of the context """ if self._predLang is not None: return self._predLang elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.predLang): if isinstance(item, Literal): return item.value else: return item else: return None @property def isString(self): """ Returns the isString of the context """ if self._isString is not None: return self._isString.value elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.isString): self.set_isString(item.value) return item.value else: return None @property def firstSentence(self): """ Returns the first sentence of the context. """ if self._sentences is None and self.graph is not None: self.load_sentences() if self._sentences is not None: return self._sentences[0] else: return None @property def lastSentence(self): """ Returns the last sentence of the context. """ if self._sentences is None and self.graph is not None: self.load_sentences() if self._sentences is not None: return self._sentences[-1] else: return None @property def sentences(self): """ Returns all sentences in the context as a list. """ if self._sentences is None and self.graph is not None: self.load_sentences() if self._sentences is not None: return list(self._sentences) else: return None @property def firstParagraph(self): """ Returns the first paragraph of the context. """ if self._paragraphs is None and self.graph is not None: self.load_paragraphs() if self._paragraphs is not None: return self._paragraphs[0] else: return None @property def lastParagraph(self): """ Returns the last paragraph of the context. """ if self._paragraphs is None and self.graph is not None: self.load_paragraphs() if self._paragraphs is not None: return self._paragraphs[-1] else: return None @property def paragraphs(self): """ Returns all paragraphs in the context as a list. """ if self._paragraphs is None and self.graph is not None: self.load_paragraphs() if self._paragraphs is not None: return list(self._paragraphs) else: return None @property def firstPage(self): """ Returns the first page of the context. """ if self._pages is None and self.graph is not None: self.load_pages() if self._pages is not None: return self._pages[0] else: return None @property def lastPage(self): """ Returns the last page of the context. """ if self._pages is None and self.graph is not None: self.load_pages() if self._pages is not None: return self._pages[-1] else: return None @property def pages(self): """ Returns all pages in the context as a list. """ if self._pages is None and self.graph is not None: self.load_pages() if self._pages is not None: return list(self._pages) else: return None @property def firstPhrase(self): """ Returns the first phrase of the context. """ if self._phrases is None and self.graph is not None: self.load_phrases() if self._phrases is not None: return self._phrases[0] else: return None @property def lastPhrase(self): """ Returns the last phrase of the context. """ if self._phrases is None and self.graph is not None: self.load_phrases() if self._phrases is not None: return self._phrases[-1] else: return None @property def phrases(self): """ Returns all phrases in the context as a list. """ if self._phrases is None and self.graph is not None: self.load_phrases() if self._phrases is not None: return list(self._phrases) else: return None @property def lexicon(self): """ Returns the lexicon """ return self._lexicon
[docs] def set_lexicon(self, lexicon: URIRef = None): """ Sets the lexicon base uri for lemmas """ if lexicon is not None: self._lexicon = lexicon else: self._lexicon = None
[docs] def set_metadata(self, metadata: dict = None): """ Sets the metadata of the context (a dict of predicates and objects) """ if metadata is not None: self._metadata = metadata else: self._metadata = {}
[docs] def set_sourceUrl(self, sourceUrl: URIRef = None): """ Sets the sourceUrl of the context """ self._sourceUrl = sourceUrl
[docs] def set_predLang(self, predLang: Union[URIRef, str] = None): """ Sets the predominant language of the context """ if predLang is not None: if not isinstance(predLang, URIRef): self._predLang = Literal(predLang) else: self._predLang = predLang else: self._predLang = None
[docs] def set_isString(self, isString: Union[Literal, str] = None): """ Sets the string of the context (rdflib.Literal or string) """ if isinstance(isString, str): self._isString = Literal(isString, datatype=XSD.string) else: self._isString = isString
[docs] def set_Pages(self, pages: list = None): """ Sets the pages of the context (a list of NifPage) """ if pages is not None and pages != []: self._pages = deque(pages) else: self._pages = None
[docs] def set_Paragraphs(self, paragraphs: list = None): """ Sets the paragraphs of the context (a list of NifParagraph) """ if paragraphs is not None and paragraphs != []: self._paragraphs = deque(paragraphs) else: self._paragraphs = None
[docs] def set_Phrases(self, phrases: list = None): """ Sets the phrases of the context (a list of NifPhrases) """ if phrases is not None and phrases != []: self._phrases = deque(phrases) else: self._phrases = None
[docs] def set_Sentences(self, sentences: list = None): """ Sets the sentences of the context (a list of NifSentence) """ if sentences is not None and sentences != []: self._sentences = deque(sentences) else: self._sentences = None
[docs] def add_sentence(self, sentence: NifSentence = None): """ Adds a sentences to the context (a NifSentence) """ if sentence is not None: if self._sentences is None: self._sentences = deque([sentence]) else: self._sentences.append(sentence)
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Context) for key in self._metadata.keys(): yield (self.uri, key, self._metadata[key]) if self._isString is not None: yield (self.uri, NIF.isString, self._isString) if self._sourceUrl is not None: yield (self.uri, NIF.sourceUrl, self._sourceUrl) if self._predLang is not None: yield (self.uri, NIF.predLang, self._predLang) if self.firstSentence is not None: yield (self.uri, NIF.firstSentence, self.firstSentence.uri) if self.lastSentence is not None: yield (self.uri, NIF.lastSentence, self.lastSentence.uri) for triple in super().triples(objects=objects): yield triple if self._sentences is not None: for sentence in self._sentences: for triple in sentence.triples(objects=objects): yield triple if self._paragraphs is not None: for paragraph in self._paragraphs: for triple in paragraph.triples(objects=objects): yield triple if self._pages is not None: for page in self._pages: for triple in page.triples(objects=objects): yield triple if self._phrases is not None: for phrase in self._phrases: for triple in phrase.triples(objects=objects): yield triple
[docs] def extract_sentences(self, forced_sentence_split_characters: list = []): """ Tokenize the string of the context and add sentences to the context """ text_dict = tokenize_text( self.isString, forced_sentence_split_characters=forced_sentence_split_characters, ) if text_dict is not None: sent_list = [] for sent_idx, sent in enumerate(text_dict): nif_sent = NifSentence( base_uri=self.uri, beginIndex=sent[0]["start_char"], endIndex=sent[-1]["end_char"], referenceContext=self, ) sent_list.append(nif_sent) self.set_Sentences(sent_list)
[docs] def load_sentences(self): """ """ sent_uris = natural_sort( [ s for s in self.graph.subjects( predicate=NIF.referenceContext, object=self.uri ) if list(self.graph.triples([s, RDF.type, NIF.Sentence])) != [] ] ) nifsentences = [] for sent_uri in sent_uris: nifsentence = NifSentence( URIScheme=self.URIScheme, uri=sent_uri, referenceContext=self, graph=self.graph, ) word_uris = list() for s in self.graph.subjects(predicate=NIF.sentence, object=sent_uri): if (s, RDF.type, NIF.Word) in self.graph: word_uris.append(s) word_uris = natural_sort(word_uris) # extract words from graph words = OrderedDict() for word_uri in word_uris: words[word_uri] = NifWord( URIScheme=self.URIScheme, uri=word_uri, referenceContext=self.referenceContext, nifsentence=nifsentence, graph=self.graph, ) nifsentence.set_Words(words.values()) # replace dependency uris by word objects for word_idx, word in enumerate(words.values()): word.set_dependency([words[dep] for dep in word.dependency]) words = nifsentence.words if words is not None: # replace nextWord and previousWord uris by word objects for word_idx, word in enumerate(words): if word_idx > 0: word.set_previousWord(words[word_idx - 1]) if word_idx < len(words) - 1: word.set_nextWord(words[word_idx + 1]) nifsentences.append(nifsentence) self.set_Sentences(nifsentences) if len(nifsentences) > 0: sentences = self.sentences if sentences is not None: for sent_idx, sentence in enumerate(sentences): if sent_idx > 0: sentence.set_previousSentence(sentences[sent_idx - 1]) if sent_idx < len(sentences) - 1: sentence.set_nextSentence(sentences[sent_idx + 1])
[docs] def load_pages(self): page_uris = natural_sort( [ s for s in self.graph.subjects( predicate=NIF.referenceContext, object=self.uri ) if list(self.graph.triples([s, RDF.type, NIF.Page])) != [] ] ) self.set_Pages( [ NifPage( URIScheme=self.URIScheme, uri=page_uri, referenceContext=self, pageNumber=idx + 1, graph=self.graph, ) for idx, page_uri in enumerate(page_uris) ] )
[docs] def load_paragraphs(self): para_uris = natural_sort( [ s for s in self.graph.subjects( predicate=NIF.referenceContext, object=self.uri ) if list(self.graph.triples([s, RDF.type, NIF.Paragraph])) != [] ] ) # extract paragraphs from graph self.set_Paragraphs( [ NifParagraph( URIScheme=self.URIScheme, uri=para_uri, referenceContext=self, graph=self.graph, ) for para_uri in para_uris ] )
[docs] def load_phrases(self): phrase_uris = natural_sort( [ s for s in self.graph.subjects( predicate=NIF.referenceContext, object=self.uri ) if list(self.graph.triples([s, RDF.type, NIF.Phrase])) != [] ] ) # extract phrases from graph self.set_Phrases( [ NifPhrase( URIScheme=self.URIScheme, uri=phrase_uri, referenceContext=self, graph=self.graph, ) for phrase_uri in phrase_uris ] )
[docs] def load_from_dict(self, stanza_dict: list = None): """ Load a context from stanza dictionary """ if stanza_dict is not None: for sent_idx, sent in enumerate(stanza_dict): nif_sent = NifSentence( base_uri=self.uri, beginIndex=sent[0]["start_char"], endIndex=sent[-1]["end_char"], referenceContext=self, URIScheme=self.URIScheme, ) self.add_sentence(nif_sent) for word_idx, word in enumerate(sent): nif_word = NifWord( base_uri=self.uri, beginIndex=word["start_char"], endIndex=word["end_char"], referenceContext=self, nifsentence=nif_sent, URIScheme=self.URIScheme, ) nif_sent.add_word(nif_word) nif_word.set_lemma(word.get("lemma", None)) upos = word.get("upos", None) if upos is not None: if upos in upos2olia.keys(): nif_word.add_pos(upos2olia.get(word["upos"])) else: logging.error( ".. part-of-speech tag not found: " + word["upos"] ) feats = word.get("feats", None) if feats is not None: for i in feats.split("|"): p = i.split("=")[0] o = i.split("=")[1] olia = mapobject(p, o) if olia is not None: nif_word.add_morphofeat(URIRef(olia)) for word_idx, word in enumerate(sent): nif_sent._words[word_idx].set_dependencyRelationType( word.get("deprel", None) ) dep = word.get("head", None) if dep is not None: if dep != 0: # if dep is 0 then it is the root nif_sent._words[word_idx].set_dependency( [nif_sent._words[dep - 1]] ) words = nif_sent._words if words is not None: for word_idx, word in enumerate(words): if word_idx < len(words) - 1: word.set_nextWord(words[word_idx + 1]) if word_idx > 0: word.set_previousWord(words[word_idx - 1]) sentences = self.sentences if sentences is not None: for sent_idx, sentence in enumerate(sentences): if sent_idx < len(sentences) - 1: sentence.set_nextSentence(sentences[sent_idx + 1]) if sent_idx > 0: sentence.set_previousSentence(sentences[sent_idx - 1]) pages = self.pages if pages is not None: # set the pages of each sentence where it occurs page_idx = 0 for sentence in self.sentences: sentence.add_page(pages[page_idx]) if page_idx < len(pages) - 1: while sentence.endIndex > pages[page_idx].endIndex: page_idx += 1 sentence.add_page(pages[page_idx])
[docs]class NifStructure(NifString): """ A NIF Structure :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param referenceContext: the context to which the string refers """ def __init__( self, base_uri: URIRef = None, uri: URIRef = None, URIScheme: str = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, referenceContext: NifContext = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, )
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: for triple in super().triples(objects=objects): yield triple
[docs]class NifPhrase(NifStructure): """ A NIF Phrase :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param nifsentence: the sentence of the word :param referenceContext: the context to which the string refers :param taIdentRef: text analysis identifier reference :param taClassRef: text analysis class reference :param taConfidence: confidence of the annotation :param PhraseType: type of phrase (EntityOccurrence, TermOccurrence) """ def __init__( self, base_uri: URIRef = None, uri: URIRef = None, URIScheme: str = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, nifsentence: NifSentence = None, referenceContext: NifContext = None, taIdentRef: URIRef = None, taClassRef: URIRef = None, taConfidence: Union[Literal, float] = None, PhraseType: str = None, nextPhrase: NifPhrase = None, previousPhrase: NifPhrase = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, ) self.set_nifsentence(nifsentence) self.set_taIdentRef(taIdentRef) self.set_taClassRef(taClassRef) self.set_taConfidence(taConfidence) self.set_PhraseType(PhraseType) self._set_nextPhrase(nextPhrase) self._set_previousPhrase(previousPhrase) def __str__(self): return self.__repr__() def __repr__(self): if self._PhraseType == EntityOccurrence: s = f"(nif:EntityOccurrence) uri = {self.uri.n3()}\n" else: s = f"(nif:TermOccurrence) uri = {self.uri.n3()}\n" if self.referenceContext is not None: s += f" referenceContext : {self.referenceContext.uri}\n" if self.nifsentence is not None: s += f" nifsentence : {self.nifsentence.uri}\n" if self.beginIndex is not None: s += f" beginIndex : {self.beginIndex}\n" if self.endIndex is not None: s += f" endIndex : {self.endIndex}\n" if self.anchorOf is not None: s += f' anchorOf : "{self.anchorOf}"\n' if self.taIdentRef is not None: s += f" taIdentRef : {self.taIdentRef}\n" if self.taClassRef is not None: s += f" taClassRef : {self.taClassRef}\n" if self.taConfidence is not None: s += f" taConfidence : {self.taConfidence}\n" return s @property def nifsentence(self): """ Returns the sentence to which the word belongs """ if self._nifsentence is not None: return self._nifsentence else: return None @property def PhraseType(self): """ Returns the phrasetype (entity or term occurrence) """ if self._PhraseType is not None: return self._PhraseType elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=RDF.type): if o in [TermOccurrence, EntityOccurrence]: return o else: return None @property def taIdentRef(self): """ Returns text analysis identifier reference """ if self._taIdentRef is not None: return self._taIdentRef elif self.graph is not None: for item in self.graph.objects( subject=self.uri, predicate=ITSRDF.taIdentRef ): return item else: return None @property def taClassRef(self): """ Returns text analysis class reference """ if self._taClassRef is not None: return self._taClassRef elif self.graph is not None: for item in self.graph.objects( subject=self.uri, predicate=ITSRDF.taClassRef ): return item else: return None @property def taConfidence(self): """ Returns text analysis confidence """ if self._taConfidence is not None: return float(self._taConfidence.value) elif self.graph is not None: for item in self.graph.objects( subject=self.uri, predicate=ITSRDF.taConfidence ): return float(item.value) else: return None @property def nextPhrase(self): """ Returns the next phrase """ if self._nextPhrase is not None: return self._nextPhrase else: return None @property def previousPhrase(self): """ Returns the previous phrase """ if self._previousPhrase is not None: return self._previousPhrase else: return None
[docs] def set_nifsentence(self, nifsentence: NifSentence = None): """ Sets the sentence of which the word is a part """ self._nifsentence = nifsentence
[docs] def set_PhraseType(self, PhraseType: str = None): """ Sets the phrase type (EntityOccurrence or TermOccurrence) """ self._PhraseType = PhraseType
[docs] def set_taIdentRef(self, taIdentRef: Union[URIRef, str] = None): """ Sets the text analysis identifier reference (as a rdflib.URIRef) """ if isinstance(taIdentRef, str): self._taIdentRef = URIRef(taIdentRef) else: self._taIdentRef = taIdentRef
[docs] def set_taClassRef(self, taClassRef: Union[URIRef, str] = None): """ Sets the text analysis class reference (as a rdflib.URIRef) """ if isinstance(taClassRef, str): self._taClassRef = URIRef(taClassRef) else: self._taClassRef = taClassRef
[docs] def set_taConfidence(self, taConfidence: Union[Literal, float] = None): """ Sets the text analysis confidence (float) """ if isinstance(taConfidence, float) or isinstance(taConfidence, str): self._taConfidence = Literal(taConfidence, datatype=XSD.float) else: self._taConfidence = taConfidence
def _set_nextPhrase(self, nextPhrase: NifPhrase = None): """ Sets the next phrase """ self._nextPhrase = nextPhrase def _set_previousPhrase(self, previousPhrase: NifPhrase = None): """ Sets the previous phrase """ self._previousPhrase = previousPhrase
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Phrase) if self._PhraseType == EntityOccurrence: yield (self.uri, RDF.type, NIF.EntityOccurrence) elif self._PhraseType == TermOccurrence: yield (self.uri, RDF.type, NIF.TermOccurrence) if self.nifsentence is not None: yield (self.uri, NIF.sentence, self._nifsentence.uri) if self.taClassRef is not None: yield (self.uri, ITSRDF.taClassRef, self._taClassRef) if self.taIdentRef is not None: yield (self.uri, ITSRDF.taIdentRef, self._taIdentRef) if self.taConfidence is not None: yield (self.uri, ITSRDF.taConfidence, self._taConfidence) for triple in super().triples(objects=objects): yield triple
[docs]class NifSentence(NifStructure): """ A NIF Sentence :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param referenceContext: the context to which the string refers :param nifpages: the pages where the sentence occurs :param nextSentence: the next sentence in the context :param previousSentence: the previous sentence in the context """ def __init__( self, base_uri: URIRef = None, uri: URIRef = None, URIScheme: str = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, referenceContext: NifContext = None, pages: List[NifPage] = None, nextSentence: Union[URIRef, str] = None, previousSentence: Union[URIRef, str] = None, words: List[Union[NifWord, URIRef]] = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, ) self.set_nextSentence(nextSentence) self.set_previousSentence(previousSentence) self.set_Words(words) self.set_pages(pages) def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:Sentence) uri = {self.uri}\n" if self.referenceContext is not None: s += f" referenceContext : {self.referenceContext.uri}\n" if self.pages is not None: s += f' pages : {", ".join([page.uri for page in self.pages])}\n' if self.beginIndex is not None: s += f" beginIndex : {self.beginIndex}\n" if self.endIndex is not None: s += f" endIndex : {self.endIndex}\n" if self.anchorOf is not None: s += f" anchorOf : {repr(self.anchorOf)}\n" if self.nextSentence is not None: if len(self.nextSentence.anchorOf) > 100: s += f" nextSentence : {repr(self.nextSentence.anchorOf[0:100])}...\n" else: s += f" nextSentence : {repr(self.nextSentence.anchorOf)}\n" if self.previousSentence is not None: if len(self.previousSentence.anchorOf) > 100: s += f" previousSentence : {repr(self.previousSentence.anchorOf[0:100])}... \n" else: s += f" previousSentence : {repr(self.previousSentence.anchorOf)}\n" if self.firstWord is not None: s += f' firstWord : "{self.firstWord.anchorOf}"\n' if self.lastWord is not None: s += f' lastWord : "{self.lastWord.anchorOf}"\n' return s @property def pages(self): if self._pages is not None: return self._pages else: return None @property def nextSentence(self): if self._nextSentence is not None: return self._nextSentence else: return None @property def previousSentence(self): if self._previousSentence is not None: return self._previousSentence else: return None @property def firstWord(self): if self._words is not None and len(self._words) > 0: return self._words[0] else: return None @property def lastWord(self): if self._words is not None and len(self._words) > 0: return self._words[-1] else: return None @property def words(self): if self._words is not None and len(self._words) > 0: return list(self._words) else: return None @property def lemmas(self): return " ".join([w.lemma for w in self.words])
[docs] def set_nextSentence(self, nextSentence: NifSentence = None): self._nextSentence = nextSentence
[docs] def set_previousSentence(self, previousSentence: NifSentence = None): self._previousSentence = previousSentence
[docs] def set_Words(self, words: list = None): if words is not None: self._words = deque(words) else: self._words = None
[docs] def set_pages(self, pages: List[NifPage] = None): if pages is not None: self._pages = pages else: self._pages = None
[docs] def add_page(self, page: NifPage = None): if page is not None: if self._pages is None: self._pages = [page] else: self._pages.append(page)
[docs] def add_word(self, word: NifWord = None): if word is not None: if self._words is None: self._words = deque([word]) else: self._words.append(word)
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Sentence) for triple in super().triples(objects=objects): yield triple if self.pages is not None: for page in self.pages: yield (self.uri, NIF.page, page.uri) if self.nextSentence is not None: yield (self.uri, NIF.nextSentence, self.nextSentence.uri) if self.previousSentence is not None: yield (self.uri, NIF.previousSentence, self.previousSentence.uri) if self.firstWord is not None: yield (self.uri, NIF.firstWord, self.firstWord.uri) if self.lastWord is not None: yield (self.uri, NIF.lastWord, self.lastWord.uri) if self._words is not None: for word in self._words: for triple in word.triples(objects=objects): yield triple
[docs]class NifParagraph(NifStructure): """ A NIF Paragraph :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param referenceContext: the context to which the string refers """ def __init__( self, URIScheme: str = None, base_uri: URIRef = None, uri: URIRef = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, referenceContext: NifContext = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, ) def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:Paragraph) uri = {self.uri}\n" if self.beginIndex is not None: s += f" beginIndex : {self.beginIndex}\n" if self.endIndex is not None: s += f" endIndex : {self.endIndex}\n" if self.anchorOf is not None: if len(self.anchorOf) > 1000: s += f' anchorOf : {repr(self.anchorOf[0:1000]+"... ")}\n' else: s += f" anchorOf : {repr(self.anchorOf)}\n" return s
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Paragraph) for triple in super().triples(objects=objects): yield triple
[docs]class NifPage(NifStructure): """ A NIF Page :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param pageNumber: the page number of the object :param referenceContext: the context to which the string refers """ def __init__( self, URIScheme: str = None, base_uri: URIRef = None, uri: URIRef = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, pageNumber: int = None, referenceContext: NifContext = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, ) self.set_pageNumber(pageNumber) def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:Page) uri = {self.uri}\n" if self.beginIndex is not None: s += f" beginIndex : {self.beginIndex}\n" if self.endIndex is not None: s += f" endIndex : {self.endIndex}\n" if self.anchorOf is not None: if len(self.anchorOf) > 1000: s += f' anchorOf : {repr(self.anchorOf[0:1000]+"... ")}\n' else: s += f" anchorOf : {repr(self.anchorOf)}\n" if self.pageNumber is not None and self.pageNumber != 0: s += f" pageNumber : {self.pageNumber}\n" return s
[docs] def set_pageNumber(self, pageNumber: int = None): if pageNumber is not None and pageNumber != 0: self._pageNumber = Literal(pageNumber, datatype=XSD.nonNegativeInteger) else: self._pageNumber = None
@property def pageNumber(self): if self._pageNumber is not None: return self._pageNumber.value else: return None
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Page) if self._pageNumber is not None: yield (self.uri, NIF.pageNumber, self._pageNumber) for triple in super().triples(objects=objects): yield triple
# class NifTitle(NifStructure): # def __init__(self, uri: str=None): # super().__init__(uri)
[docs]class NifWord(NifStructure): """ A NIF Word :param URIScheme: the URIScheme of the object :param base_uri: the uri from which the uri of the object is derived :param uri: the uri of the object :param beginIndex: the start index in the context string :param endIndex: the end index in the context string :param referenceContext: the context to which the string refers :param nifsentence: the sentence of the word :param lemma: the lemma of the word :param pos: the part-of-speech tags (a list) :param morphofeats: the morphological features (a list) :param dependency: dependency relations of the word (a list) :param dependencyRelationType: the type of dependency relation of the word :param nextWord: the next word in the sentence :param previousWord: the previous word in the sentence """ def __init__( self, URIScheme: str = None, base_uri: URIRef = None, uri: URIRef = None, beginIndex: Union[Literal, int] = None, endIndex: Union[Literal, int] = None, referenceContext: NifContext = None, nifsentence: NifSentence = None, lemma: Union[URIRef, str] = None, pos: list = None, morphofeats: list = None, dependency: list = None, dependencyRelationType: str = None, nextWord: str = None, previousWord: str = None, graph: Graph = None, ): super().__init__( URIScheme=URIScheme, base_uri=base_uri, uri=uri, beginIndex=beginIndex, endIndex=endIndex, referenceContext=referenceContext, graph=graph, ) self.set_nifsentence(nifsentence) self.set_lemma(lemma) self.set_pos(pos) self.set_morphofeats(morphofeats) self.set_dependency(dependency) self.set_dependencyRelationType(dependencyRelationType) self.set_nextWord(nextWord) self.set_previousWord(previousWord) def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:Word) uri = {self.uri}\n" if self.referenceContext is not None: s += f" referenceContext : {self.referenceContext.uri}\n" if self.nifsentence is not None: s += f" nifsentence : {self.nifsentence.uri}\n" if self.beginIndex is not None: s += f" beginIndex : {self.beginIndex}\n" if self.endIndex is not None: s += f" endIndex : {self.endIndex}\n" if self.nextWord is not None: s += f' nextWord : "{self.nextWord.anchorOf}"\n' if self.previousWord is not None: s += f' previousWord : "{self.previousWord.anchorOf}"\n' if self.anchorOf is not None: s += f' anchorOf : "{self.anchorOf}"\n' if self.lemma is not None: s += f' lemma : "{self.lemma}"\n' if self.pos is not None and self.pos != []: s += f' pos : {", ".join([str(m).replace(OLIA, "olia:") for m in self.pos])}\n' if self.morphofeats is not None and self.morphofeats != []: s += f' morphofeats : {", ".join([str(m).replace(OLIA, "olia:") for m in self.morphofeats])}\n' if self.dependency is not None and self.dependency != []: s += f' dependency : {", ".join([dep.uri for dep in self.dependency])}\n' # [str(dep.uri) for dep in self.dependency] if self.dependencyRelationType is not None: s += f" dependencyRelationtype : {self.dependencyRelationType}\n" return s @property def nifsentence(self): """ Returns the sentence to which the word belongs """ if self._nifsentence is not None: return self._nifsentence else: return None @property def lemma(self): """ Returns the lemma of the word """ if self._lemma is not None: if isinstance(self._lemma, Literal): return self._lemma.value else: return self._lemma elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.lemma): if isinstance(item, Literal): return item.value else: return item else: return None @property def pos(self): """ Returns the part-of-speech (pos) of the word """ if self._pos is not None: return self._pos elif self.graph is not None: return [ item for item in self.graph.objects(subject=self.uri, predicate=NIF.pos) ] else: return None @property def morphofeats(self): """ Returns the morphological features of the word as a list """ if self._morphofeats is not None: return self._morphofeats elif self.graph is not None: return [ item for item in self.graph.objects(subject=self.uri, predicate=NIF.oliaLink) ] else: return [] @property def dependency(self): """ Returns the dependencies of the word as a list """ if self._dependency is not None: return self._dependency elif self.graph is not None: return [ item for item in self.graph.objects( subject=self.uri, predicate=NIF.dependency ) ] else: return [] @property def dependencyRelationType(self): """ Returns the dependency relation type of the word """ if self._dependencyRelationType is not None: return self._dependencyRelationType.value elif self.graph is not None: for item in self.graph.objects( subject=self.uri, predicate=NIF.dependencyRelationType ): return item.value else: return None @property def nextWord(self): """ Returns the next word of the word in the sentence """ if self._nextWord is not None: return self._nextWord else: return None @property def previousWord(self): """ Returns the previous word of the word in the sentence """ if self._previousWord is not None: return self._previousWord else: return None
[docs] def set_nifsentence(self, nifsentence: NifSentence = None): """ Sets the sentence of which the word is a part """ self._nifsentence = nifsentence
[docs] def set_lemma(self, lemma: Union[URIRef, str] = None): """ Sets the lemma of the word (a string) """ if lemma is not None and lemma != "": if isinstance(lemma, URIRef): self._lemma = lemma else: self._lemma = Literal(lemma, datatype=XSD.string) else: self._lemma = None
[docs] def set_pos(self, pos: list = None): """ Sets the part-of-speech (pos) of the word (a rdflib.URIRef or a list of rdflib.URIRef) """ if pos is not None and pos != []: self._pos = pos else: self._pos = None
[docs] def set_morphofeats(self, morphofeats: list = None): """ Sets the morphological features of the word (a rdflib.URIRef or a list of rdflib.URIRef) """ if morphofeats is not None and morphofeats != []: self._morphofeats = morphofeats else: self._morphofeats = None
[docs] def set_dependency(self, dependency: list = None): """ Sets the dependency of the word (a list) """ if dependency is not None and dependency != []: self._dependency = dependency else: self._dependency = None
[docs] def set_dependencyRelationType(self, dependencyRelationType: str = None): """ Sets the dependencyRelationType of the word (a string) """ if dependencyRelationType is not None and dependencyRelationType != "": self._dependencyRelationType = Literal( dependencyRelationType, datatype=XSD.string ) else: self._dependencyRelationType = None
[docs] def set_nextWord(self, nextWord: NifWord = None): """ Sets the next word of the word in the sentence """ self._nextWord = nextWord
[docs] def set_previousWord(self, previousWord: NifWord = None): """ Sets the previous word of the word in the sentence """ self._previousWord = previousWord
[docs] def add_dependency(self, dependency: URIRef = None): """ Add a dependency to the list of dependencies of the word """ if self.dependency is not None and self.dependency != []: self.dependency.append(dependency) else: self.set_dependency([dependency])
[docs] def add_morphofeat(self, morphofeat: URIRef = None): """ Add a morphofeat to the list of morphofeats of the word """ if self.morphofeats is not None and self.morphofeats != []: self.morphofeats.append(morphofeat) else: self.set_morphofeats([morphofeat])
[docs] def add_pos(self, pos: URIRef = None): """ Add a pos to the list of pos of the word """ if self.pos is not None and self.pos != []: self.pos.append(pos) else: self.set_pos([pos])
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.Word) if self.nifsentence is not None: yield (self.uri, NIF.sentence, self._nifsentence.uri) for triple in super().triples(objects=objects): yield triple yield ( self.uri, NIF.anchorOf, Literal(self.anchorOf, datatype=XSD.string), ) if self.lemma is not None: if self.referenceContext.lexicon is not None: # prevent that uribaker converts this to underscore lemma = self._lemma.replace('"', "%22") lemma_uri = URIRef( iribaker.to_iri(str(self.referenceContext.lexicon) + lemma) ) yield (self.uri, NIF.lemma, lemma_uri) else: yield (self.uri, NIF.lemma, self._lemma) if self.pos is not None and self._pos != []: for pos in self._pos: yield (self.uri, NIF.pos, pos) if self._morphofeats is not None and self._morphofeats != []: for morphofeat in self._morphofeats: yield (self.uri, NIF.oliaLink, morphofeat) if self.nextWord is not None: yield (self.uri, NIF.nextWord, self.nextWord.uri) if self.previousWord is not None: yield (self.uri, NIF.previousWord, self.previousWord.uri) if self.dependencyRelationType is not None: yield ( self.uri, NIF.dependencyRelationType, self._dependencyRelationType, ) if self._dependency is not None: for dep in self._dependency: yield (self.uri, NIF.dependency, dep.uri)
[docs]class NifContextCollection(NifBase): """ A NIF Context Collection :param uri: the uri of the object :param hasContext: the list of contexts of the collection :param conformsTo: the NIF Ontology version """ def __init__( self, uri: Union[URIRef, str] = None, hasContext: list = None, conformsTo: Union[URIRef, str] = None, graph: Graph = None, ): super().__init__( uri=uri, ) self.set_graph(graph) self.set_hasContext(hasContext) self.set_conformsTo(conformsTo)
[docs] def set_graph(self, graph: Graph = None): self.graph = graph
@property def hasContext(self): if self._hasContext is None and self.graph is not None: self.load_contexts() if self._hasContext is not None: return list(self._hasContext.values()) else: return [] @property def contexts(self): return self.hasContext
[docs] def set_hasContext(self, hasContext: list = None): if hasContext is not None: self._hasContext = {context.uri: context for context in hasContext} else: self._hasContext = None
[docs] def load_contexts(self): contexts = [] for item in self.graph.objects(subject=self.uri, predicate=NIF.hasContext): contexts.append(NifContext(uri=item, graph=self.graph)) self.set_hasContext(contexts)
@property def conformsTo(self): if self._conformsTo is not None: return self._conformsTo elif self.graph is not None: for item in self.graph.objects(subject=self.uri, predicate=NIF.conformsTo): return item else: return None
[docs] def set_conformsTo(self, conformsTo: Union[URIRef, str]): if conformsTo is not None: if isinstance(conformsTo, str): self._conformsTo = URIRef(conformsTo) else: self._conformsTo = conformsTo else: self._conformsTo = URIRef(NIF_ONTOLOGY)
[docs] def add_context(self, context: NifContext = None): if context is not None: if self._hasContext is None: self._hasContext = {} self._hasContext[context.uri] = context
def __str__(self): return self.__repr__() def __repr__(self): s = f"(nif:ContextCollection) uri = {self.uri}\n" s += f" conformsTo : {self.conformsTo}\n" for context in self.hasContext[0:10]: s += f" hasContext : {context.uri}\n" if len(self.hasContext) > 10: s += " hasContext : ... \n" return s
[docs] def triples(self, objects=None): """ Generates all the triples """ if objects is None or any([isinstance(self, obj) for obj in objects]): if self.uri is not None: yield (self.uri, RDF.type, NIF.ContextCollection) if self.conformsTo is not None: yield (self.uri, DCTERMS.conformsTo, self.conformsTo) for context in self.hasContext: if self.uri is not None: yield (self.uri, NIF.hasContext, context.uri) for triple in context.triples(objects=objects): yield triple