nifigator package#

Submodules#

nifigator.converters module#

converters

nifigator.converters.nafConverter(collection_name: str | None = None, context_name: str | None = None, nafdocument: NafDocument | None = None, base_uri: str | None = None, base_prefix: str | None = None, URIScheme: str | None = None)[source]#

nifigator.lemongraph module#

class nifigator.lemongraph.LemonGraph(file: str | None = None, lexicon: Lexicon | None = None, URIScheme: str | None = None, store: Store | str = 'default', identifier: IdentifiedNode | str | None = None, namespace_manager: NamespaceManager | None = None, base: str | None = None, bind_namespaces: str = 'core')[source]#

Bases: Graph

An Ontolex-Lemon Graph

The constructor accepts the same arguments as a rdflib.Graph.

Parameters:
  • file – name of the file to read

  • lexicon – an Ontolex-Lemon Lexicon

extract_lexicon(lexicon_uri: URIRef | None = None)[source]#
open(file: str | None = None, lexicon: Lexicon | None = None)[source]#

Read data from multiple sources into current LemonGraph object.

Parameters:
  • file – name of the file to read

  • lexicon – an Ontolex-Lemon Lexicon

Returns:

None

query_rdf_type(rdf_type: URIRef | None = None)[source]#

nifigator.lemonobjects module#

class nifigator.lemonobjects.Component(uri: URIRef | None = None, correspondsTo: List[LexicalEntry] | None = None)[source]#

Bases: PhraseElement, LemonBase, LemonElement

A constituent element of a lexical entry. This may be a word in a multi-word lexical element or a constituent of a compound word

Parameters:

element – Denotes the lexical entry represented by the component

add_correspondsTo(correspondsTo: LexicalEntry | None = None)[source]#
property correspondsTo#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
set_correspondsTo(correspondsTo: List[LexicalEntry] | None = None)[source]#
string_rep(level: int = 0)[source]#
triples()[source]#
class nifigator.lemonobjects.ComponentList(uri: URIRef | None = None, components: list | None = None)[source]#

Bases: LemonBase

A node within a list of components. This should generally be a blank node, see rdf:List.

add_component(component=None)[source]#
property components#
set_components(components: list | None = None)[source]#
string_rep(level: int = 0)[source]#
triples()[source]#
class nifigator.lemonobjects.Form(uri: URIRef | None = None, formVariant: str | None = None, representations: List[str] | None = None, writtenReps: List[str] | None = None)[source]#

Bases: LemonBase, LemonElement

A given written or spoken realisation of a lexical entry.

Parameters:
  • formVariant

  • representation – A realisation of a given form

  • writtenRep – Gives the written representation of a given form

add_representation(representation: str | None = None)[source]#
add_writtenRep(writtenRep=None)[source]#
property formVariant#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
property representations#
set_formVariant(formVariant: str | None = None)[source]#
set_representations(representations: List[str] | None = None)[source]#
set_writtenReps(writtenReps: List[str] | None = None)[source]#
string_rep(level: int = 0)[source]#
triples()[source]#
property writtenReps#
class nifigator.lemonobjects.HasLanguage(uri: URIRef | None = None, language: str | None = None)[source]#

Bases: LemonBase

Structural element for all elements that can be tagged with a language.

property language#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
set_language(language: str | None = None)[source]#
triples()[source]#
class nifigator.lemonobjects.HasPatterns(uri: URIRef | None = None, MorphPatterns: list | None = None)[source]#

Bases: LemonBase, LemonElement

property MorphPatterns#
add_MorphPattern(MorphPattern: URIRef | None = None)[source]#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
set_MorphPatterns(MorphPatterns: list | None = None)[source]#
triples()[source]#
class nifigator.lemonobjects.LemonBase(uri: URIRef | str | None = None)[source]#

Bases: object

A Lemon Base

Parameters:

uri – the uri of the object

set_uri(uri: URIRef | str | None = None)[source]#

Sets the uri of the object. If the uri is a string then it is converted to an iri.

property uri#

Returns the uri of the object

class nifigator.lemonobjects.LemonElement(uri: URIRef | None = None, property: URIRef | None = None)[source]#

Bases: object

Parameters:

property – Denotes a lexical property of a lexical entry, form, component or MWE node. For the lexical entry this is assumed to be static properties, e.g., part of speech and gender, and for the others this is assumed to be specific properties, e.g., case, number

property property#
set_property(property: URIRef | None = None)[source]#
set_uri(uri: URIRef | None = None)[source]#
triples()[source]#
property uri#
class nifigator.lemonobjects.LexicalEntry(uri: URIRef | None = None, language: str | None = None, patterns: URIRef | None = None, abstractForms: List[Form] | None = None, canonicalForm: Form | None = None, lexicalForms: List[Form] | None = None, otherForms: List[Form] | None = None, lexicalVariant: List[LexicalEntry] | None = None, constituents: list | None = None, decomposition: ComponentList | None = None, senses: List[LexicalSense] | None = None, label: str | None = None, partOfSpeechs: List[str] | None = None, termType: str | None = None, reliabilityCode: int | None = None)[source]#

Bases: HasLanguage, HasPatterns, LemonBase, LemonElement

An entry in the lexicon. This may be any morpheme, word, compound, phrase or clause that is included in the lexicon.

Parameters:
  • abstractForm – A representation of a lexical entry that should not be considered canonical. This is primarily from a linguistic view for non-realisable forms such as stems but may also include misspellings and other unusual forms

  • canonicalForm – The canonical (“dictionary”) form of the lexical entry. This can be used to indicate the “lemma” form of a lexical entry

  • lexicalForm – Denotes a written representation of a lexical entry

  • otherForm – A non-preferred (“non-dictionary”) representation of a lexical entry. This should be a variant that is either a morphological variant, an abbreviation, short form or acronym

  • lexicalVariant – Indicates a non-semantic relationship between two lexical entries. E.g., a term is derived from another term, such as “lexical” and “lexicalize”

  • decomposition – Denotes a component of a lexical entry

  • phraseRoot – Indicates the head node of a phrase structure or dependency parse graph

  • sense – Indicates the sense of a lexical entry

  • label – the rdfs:label of the lexical entry

  • partOfSpeech – the partOfSpeech tag of the lexical entry

  • termType – the tbx:termType of the lexical entry

  • reliabilityCode – the tbx:reliabilityCode of the lexical entry

property abstractForms#
add_abstractForm(abstractForm: Form | None = None)[source]#
add_canonicalForm(canonicalForm: Form | None = None)[source]#
add_lexicalForm(lexicalForm: Form | None = None)[source]#
add_lexicalVariant(lexicalVariant: LexicalEntry | None = None)[source]#
add_otherForm(otherForm: Form | None = None)[source]#
add_partOfSpeech(partOfSpeech: str | None = None)[source]#
add_sense(sense: LexicalSense | None = None)[source]#
property canonicalForm#
property constituents#
property decomposition#
property label#
property lexicalForms#
property lexicalVariant#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
property otherForms#
property partOfSpeechs#
property phraseRoot#
property reliabilityCode#
property senses#
set_abstractForms(abstractForms: List[Form] | None = None)[source]#
set_canonicalForm(canonicalForm: Form | None = None)[source]#
set_constituents(constituents: list | None = None)[source]#
set_decomposition(decomposition: ComponentList | None = None)[source]#
set_label(label: str | None = None)[source]#
set_lexicalForms(lexicalForms: List[Form] | None = None)[source]#
set_lexicalVariant(lexicalVariant: List[LexicalEntry] | None = None)[source]#
set_otherForms(otherForms: List[Form] | None = None)[source]#
set_partOfSpeechs(partOfSpeechs: str | None = None)[source]#
set_reliabilityCode(reliabilityCode: int | None = None)[source]#
set_senses(senses: List[LexicalSense] | None = None)[source]#
set_termType(termType: str | None = None)[source]#
string_rep(level: int = 0)[source]#
property synBehavior#
property termType#
triples()[source]#
class nifigator.lemonobjects.LexicalSense(uri: URIRef | None = None, altRef: LexicalSense | None = None, broader: LexicalSense | None = None, narrower: LexicalSense | None = None, equivalent: LexicalSense | None = None, incompatible: LexicalSense | None = None, isReferenceOf: LexicalSense | None = None, isSenseOf: LexicalEntry | None = None, senseRelation: LexicalSense | None = None, subsense: LexicalSense | None = None)[source]#

Bases: LemonBase, LemonElement

Represents the intersection in meaning between the lexical entry and the ontology entity. This is used as the ontology entity and lexical entry may not be in one-to-one correspondence; as such, the existence of a sense between them states merely that there are some cases when this lexical entry refers to the ontology entity and vice versa. Mapping elements can be used to further specify this relation.

Parameters:
  • altRef – The sense of a non-preferred but admissible lexicalization of a given ontology entity

  • broader – Denotes that one sense is broader than another. From a lexical point of view this means replacing one lexical entry with another generalizes the meaning of the phrase. From an ontological point of view this property makes no strong assertions. From a mapping point of view if the broader sense applies the narrower sense must also

  • narrower – Denotes that one sense is narrower than another. From a lexical point of view this means replacing one lexical entry with another specializes the meaning of the phrase. From an ontological point of view this property makes no strong assertions. From a mapping point of view if the broader sense applies the narrower sense must also

  • condition – Indicates an evaluable test that is necessary for this sense to apply

  • context – Denotes the pragmatic or discursive context of a sense mapping or a constraint on the mapping by syntactic or semantic properties

  • definition – Indicates a natural language definition. Note there is a pseudo-node to allow for further description of the definition (e.g., source, creation date etc.). The value property should be used to indicate the string value of the definition.

  • equivalent – Indicates that two senses are equivalent. From a lexical point of view, this indicates that the lexical entries can be substituted for each other with no change in meaning. From an ontological point of view it means that the two references are not disjoint. From a mapping point of view it means if one mapping applies the other must necessarily apply

  • example

  • incompatible – Says that the two senses are disjoint. From a lexical point of view, this means substituting the lexical entries must change the meaning of the phrase. From an ontological point of view, this property is implied if both references are also disjoint, but does not imply disjointness, but non-equivalence of the references. From the mapping point of view there is no instance when both mappings are valid.

  • isA – Denotes that the single argument of a class predicate is represented in the lexicon by the given semantic argument. That is Class(?x) or ?x rdf:type Class

  • isReferenceOf – Indicate that a reference has a given sense

  • isSenseOf – Indicate that a sense is realised by the given lexical entry

  • objOfProp – Indicates the semantic argument which represents the objects (ranges) of the property referred to by this sense

  • reference – A reference to an external resource

  • propertyDomain – Indicates a restriction on the domain of the property. That is, this sense only applies if the property the sense refers to has a subject in the class referred to by this property

  • propertyRange – Indicates a restriction on the range of the property. That is, this sense only applies if the property the sense refers to has an object in the class referred to by this property

  • semArg – Denotes a semantic argument slot of a semantic unit

  • senseRelation – Denotes a relationship between senses

  • subsense – Indicates the relation between a compound sense and its atomic subsenses

  • subjOfProp – Indicates the semantic argument which represents the subjects (domain) of the property referred to by this sense

property altRef#
property broader#
property equivalent#
property incompatible#
property isReferenceOf#
property isSenseOf#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
property narrower#
property senseRelation#
set_altRef(altRef: LexicalSense | None = None)[source]#
set_broader(broader: LexicalSense | None = None)[source]#
set_equivalent(equivalent: LexicalSense | None = None)[source]#
set_incompatible(incompatible: LexicalSense | None = None)[source]#
set_isReferenceOf(isReferenceOf: LexicalSense | None = None)[source]#
set_isSenseOf(isSenseOf: LexicalEntry | None = None)[source]#
set_narrower(narrower: LexicalSense | None = None)[source]#
set_senseRelation(senseRelation: LexicalSense | None = None)[source]#
set_subsense(subsense: LexicalSense | None = None)[source]#
string_rep(level: int = 0)[source]#
property subsense#
triples()[source]#
class nifigator.lemonobjects.Lexicon(uri: URIRef | None = None, entries: list | None = None, language: str | None = None, patterns: list | None = None)[source]#

Bases: HasLanguage, HasPatterns, LemonBase, LemonElement

The lexicon object. This object is specific to the given language and/or domain it describes.

Parameters:
  • entry – Indicates an entry in a lexicon

  • language – the language of the lexicon

  • pattern

add_entry(entry: LexicalEntry | None = None)[source]#
property entries#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
set_entries(entries: list | None = None)[source]#
string_rep(level: int = 0)[source]#
triples()[source]#
class nifigator.lemonobjects.Node(uri: URIRef | None = None, constituent: NodeConstituent | None = None)[source]#

Bases: LemonElement

A node in a phrase structure or dependency parse graph.

Parameters:
  • edge – Denotes the relation between a node in a multi-word expression structure and an edge

  • leaf – Denotes the component referred to by the lex (pre-terminal) of the phrase structure

  • separator – Indicates the graphical element used to separate the subnodes of this phrase structure. It is generally recommended that you use a string value with the language tag used to indicate script (i.e., using ISO-15924 codes, such as “Latn”), as orthographic features may change with script.

property constituent#
load(graph: Graph | None = None, uri: URIRef | None = None)[source]#
set_constituent(constituent: NodeConstituent | None = None)[source]#
string_rep(level: int = 0)[source]#
triples()[source]#
class nifigator.lemonobjects.NodeConstituent(uri: URIRef | None = None)[source]#

Bases: LemonElement

The class of constituents, that is types applied to nodes in a phrase structure graph.

class nifigator.lemonobjects.PhraseElement[source]#

Bases: LemonBase, LemonElement

A terminal node in a phrase structure graph, i.e., a realisable, lexical element.

nifigator.multisets module#

nifigator.multisets.containment_index(c1: set | None = None, c2: set | None = None)[source]#

Function to calculate the containment of set B in set A

nifigator.multisets.jaccard_index(c1: set | None = None, c2: set | None = None)[source]#

Function to calculate the Jaccard index of two sets

nifigator.multisets.merge_multiset(d: dict | None = None)[source]#

Function to calculate the multiset from a dict of phrases

nifigator.nafobjects module#

class nifigator.nafobjects.NafBase[source]#

Bases: object

get_attributes(data=None, namespace=None, exclude=[])[source]#
layer(tree: _ElementTree | None = None, layer_tag: str | None = None)[source]#
subelement(element: _Element | None = None, tag: str | None = None, data={}, attributes_to_ignore: list = [])[source]#
class nifigator.nafobjects.NafDepsLayer[source]#

Bases: object

parse(tree: _ElementTree | None = None)[source]#
write(tree: _ElementTree | None = None)[source]#
class nifigator.nafobjects.NafEntitiesLayer[source]#

Bases: object

parse(tree: _ElementTree | None = None)[source]#
write(tree: _ElementTree | None = None)[source]#
class nifigator.nafobjects.NafHeaderLayer(metadata: dict | None = None)[source]#

Bases: NafBase

property metadata#
parse(tree: _ElementTree | None = None)[source]#
set_metadata(metadata)[source]#
write(tree: _ElementTree | None = None)[source]#
class nifigator.nafobjects.NafRawLayer(raw: str | None = None)[source]#

Bases: NafBase

parse(tree: _ElementTree | None = None)[source]#

Parse the raw layer from an xml tree

property raw#

Returns the raw text of the document

set_raw(raw: str | None = None)[source]#

Sets the raw text of the document

write(tree: _ElementTree | None = None)[source]#

Add the raw layer to the xml tree

class nifigator.nafobjects.NafTermsLayer[source]#

Bases: NafBase

parse(tree: _ElementTree | None = None)[source]#
write(tree: _ElementTree | None = None)[source]#
class nifigator.nafobjects.NafTextLayer(wordforms: List[WfElement] | None = None)[source]#

Bases: NafBase

parse(tree: _ElementTree | None = None)[source]#
set_wordforms(wordforms: List[WfElement])[source]#
property wordforms#
write(tree: _ElementTree | None = None)[source]#
nifigator.nafobjects.WordformElement#

alias of WfElement

nifigator.nifgraph module#

class nifigator.nifgraph.NifGraph(file: str | None = None, nafdocument: NafDocument | None = None, collection: NifContextCollection | None = None, URIScheme: str | None = None, store: Store | str = 'default', identifier: IdentifiedNode | str | None = None, namespace_manager: NamespaceManager | None = None, base: str | None = None, bind_namespaces: str = 'core')[source]#

Bases: Graph

A NIF Graph

The constructor accepts the same arguments as a rdflib.Graph.

Parameters:
  • file – name of the file to read

  • nafdocument – an xml file in NLP Annotation Format

  • collection – a NifContextCollection

property catalog#
property collection: NifContextCollection#

This property constructs and returns the first nif:ContextCollection from the NifGraph.

Returns:

the first nif:ContextCollection in the graph

property collections: list#

This property constructs and returns a list of nif:ContextCollection from the NifGraph.

Returns:

list of nif:ContextCollection in the graph

context_graph(uri: URIRef | None = None)[source]#
property contexts: list#

This property constructs and returns a list of nif:Context from the NifGraph.

Returns:

list of nif:Context in the graph

get(uri: URIRef | None = None)[source]#
property lexicon#
open(file: str | None = None, nafdocument: NafDocument | None = None, collection: NifContextCollection | None = None)[source]#

Read data from multiple sources into current NifGraph object.

Parameters:
  • file – name of the file to read

  • nafdocument – an xml file in NLP Annotation Format

  • collection – a NifContextCollection

Returns:

None

nifigator.nifobjects module#

class nifigator.nifobjects.NifBase(uri: URIRef | str | None = None)[source]#

Bases: object

A NIF Base

Parameters:

uri – the uri of the object

set_uri(uri: URIRef | str | None = None)[source]#

Sets the uri of the object. If the uri is a string then it is converted to an iri.

property uri#

Returns the uri of the object

class nifigator.nifobjects.NifContext(URIScheme: str | None = None, base_uri: URIRef | None = None, uri: URIRef | None = None, sourceUrl: URIRef | None = None, predLang: URIRef | None = None, isString: Literal | str | None = None, metadata: dict | None = None, lexicon: URIRef | None = None, graph: Graph | None = None)[source]#

Bases: NifString

A NIF Context

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • sourceUrl – the source url of the context

  • predLang – the predominant language of the context

  • isString – the string of the context

  • metadata – the metadata of the context (a dict of predicates and objects)

add_sentence(sentence: NifSentence | None = None)[source]#

Adds a sentence to the context (a NifSentence)

extract_sentences(forced_sentence_split_characters: list = [])[source]#

Tokenize the string of the context and add sentences to the context

property firstPage#

Returns the first page of the context.

property firstParagraph#

Returns the first paragraph of the context.

property firstPhrase#

Returns the first phrase of the context.

property firstSentence#

Returns the first sentence of the context.

property isString#

Returns the isString of the context

property lastPage#

Returns the last page of the context.

property lastParagraph#

Returns the last paragraph of the context.

property lastPhrase#

Returns the last phrase of the context.

property lastSentence#

Returns the last sentence of the context.

property lexicon#

Returns the lexicon

load_from_dict(stanza_dict: list | None = None)[source]#

Load a context from a stanza dictionary

load_pages()[source]#
load_paragraphs()[source]#
load_phrases()[source]#
load_sentences()[source]#
property metadata#

Returns the metadata of the context

property pages#

Returns all pages in the context as a list.

property paragraphs#

Returns all paragraphs in the context as a list.

property phrases#

Returns all phrases in the context as a list.

property predLang#

Returns the predLang of the context

property sentences#

Returns all sentences in the context as a list.

set_Pages(pages: list | None = None)[source]#

Sets the pages of the context (a list of NifPage)

set_Paragraphs(paragraphs: list | None = None)[source]#

Sets the paragraphs of the context (a list of NifParagraph)

set_Phrases(phrases: list | None = None)[source]#

Sets the phrases of the context (a list of NifPhrase)

set_Sentences(sentences: list | None = None)[source]#

Sets the sentences of the context (a list of NifSentence)

set_isString(isString: Literal | str | None = None)[source]#

Sets the string of the context (rdflib.Literal or string)

set_lexicon(lexicon: URIRef | None = None)[source]#

Sets the lexicon base uri for lemmas

set_metadata(metadata: dict | None = None)[source]#

Sets the metadata of the context (a dict of predicates and objects)

set_predLang(predLang: URIRef | str | None = None)[source]#

Sets the predominant language of the context

set_sourceUrl(sourceUrl: URIRef | None = None)[source]#

Sets the sourceUrl of the context

property sourceUrl#

Returns the sourceUrl of the context

triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifContextCollection(uri: URIRef | str | None = None, hasContext: list | None = None, conformsTo: URIRef | str | None = None, graph: Graph | None = None)[source]#

Bases: NifBase

A NIF Context Collection

Parameters:
  • uri – the uri of the object

  • hasContext – the list of contexts of the collection

  • conformsTo – the NIF Ontology version

add_context(context: NifContext | None = None)[source]#
property conformsTo#
property contexts#
property hasContext#
load_contexts()[source]#
set_conformsTo(conformsTo: URIRef | str)[source]#
set_graph(graph: Graph | None = None)[source]#
set_hasContext(hasContext: list | None = None)[source]#
triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifPage(URIScheme: str | None = None, base_uri: URIRef | None = None, uri: URIRef | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, pageNumber: int | None = None, referenceContext: NifContext | None = None, graph: Graph | None = None)[source]#

Bases: NifStructure

A NIF Page

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • pageNumber – the page number of the object

  • referenceContext – the context to which the string refers

property pageNumber#
set_pageNumber(pageNumber: int | None = None)[source]#
triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifParagraph(URIScheme: str | None = None, base_uri: URIRef | None = None, uri: URIRef | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, referenceContext: NifContext | None = None, graph: Graph | None = None)[source]#

Bases: NifStructure

A NIF Paragraph

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • referenceContext – the context to which the string refers

triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifPhrase(base_uri: URIRef | None = None, uri: URIRef | None = None, URIScheme: str | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, nifsentence: NifSentence | None = None, referenceContext: NifContext | None = None, taIdentRef: URIRef | None = None, taClassRef: URIRef | None = None, taConfidence: Literal | float | None = None, PhraseType: str | None = None, nextPhrase: NifPhrase | None = None, previousPhrase: NifPhrase | None = None, graph: Graph | None = None)[source]#

Bases: NifStructure

A NIF Phrase

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • nifsentence – the sentence of the phrase

  • referenceContext – the context to which the string refers

  • taIdentRef – text analysis identifier reference

  • taClassRef – text analysis class reference

  • taConfidence – confidence of the annotation

  • PhraseType – type of phrase (EntityOccurrence, TermOccurrence)

property PhraseType#

Returns the phrasetype (entity or term occurrence)

property nextPhrase#

Returns the next phrase

property nifsentence#

Returns the sentence to which the phrase belongs

property previousPhrase#

Returns the previous phrase

set_PhraseType(PhraseType: str | None = None)[source]#

Sets the phrase type (EntityOccurrence or TermOccurrence)

set_nifsentence(nifsentence: NifSentence | None = None)[source]#

Sets the sentence of which the phrase is a part

set_taClassRef(taClassRef: URIRef | str | None = None)[source]#

Sets the text analysis class reference (as a rdflib.URIRef)

set_taConfidence(taConfidence: Literal | float | None = None)[source]#

Sets the text analysis confidence (float)

set_taIdentRef(taIdentRef: URIRef | str | None = None)[source]#

Sets the text analysis identifier reference (as a rdflib.URIRef)

property taClassRef#

Returns text analysis class reference

property taConfidence#

Returns text analysis confidence

property taIdentRef#

Returns text analysis identifier reference

triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifSentence(base_uri: URIRef | None = None, uri: URIRef | None = None, URIScheme: str | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, referenceContext: NifContext | None = None, pages: List[NifPage] | None = None, nextSentence: URIRef | str | None = None, previousSentence: URIRef | str | None = None, words: List[NifWord | URIRef] | None = None, graph: Graph | None = None)[source]#

Bases: NifStructure

A NIF Sentence

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • referenceContext – the context to which the string refers

  • pages – the pages where the sentence occurs

  • nextSentence – the next sentence in the context

  • previousSentence – the previous sentence in the context

add_page(page: NifPage | None = None)[source]#
add_word(word: NifWord | None = None)[source]#
property firstWord#
property lastWord#
property lemmas#
property nextSentence#
property pages#
property previousSentence#
set_Words(words: list | None = None)[source]#
set_nextSentence(nextSentence: NifSentence | None = None)[source]#
set_pages(pages: List[NifPage] | None = None)[source]#
set_previousSentence(previousSentence: NifSentence | None = None)[source]#
triples(objects=None)[source]#

Generates all the triples

property words#
class nifigator.nifobjects.NifString(URIScheme: str | None = None, base_uri: URIRef | None = None, uri: URIRef | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, referenceContext: NifContext | None = None, graph: Graph | None = None)[source]#

Bases: NifBase

A NIF String

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • referenceContext – the context to which the string refers

property URIScheme#

Returns the URIScheme

property anchorOf#

Returns the string of the object as a str. The anchorOf is not stored in the object but extracted from the referenceContext.

property anchorOf_no_accents#

Returns the string without accents of the object as a str.

property anchorOf_no_diacritics#

Returns the string without diacritics of the object as a str.

property beginIndex#

Returns the start index of the context string as an int.

property endIndex#

Returns the end index of the context string as an int.

property referenceContext#

Returns the context of the current object

set_URIScheme(URIScheme: str | None = None)[source]#

Sets the URIScheme of the object

set_anchorOf(anchorOf: Literal | str | None = None)[source]#

The anchorOf should be consistent with the string in the referenceContext, otherwise an error is logged.

set_base_uri(base_uri: URIRef | None = None)[source]#

Sets the base uri of the object

set_beginIndex(beginIndex: Literal | int | None = None)[source]#

Sets the start index of the string. The type of beginIndex can be a Literal or an int. If the type is an int then it is converted to a Literal.

set_endIndex(endIndex: Literal | int | None = None)[source]#

Sets the end index of the string. The type of endIndex can be a Literal or an int. If the type is an int then it is converted to a Literal.

set_graph(graph: Graph | None = None)[source]#
set_referenceContext(referenceContext: NifContext | None = None)[source]#

Sets the referenceContext of the object.

set_uri(uri: URIRef | None = None)[source]#

Sets the uri of the object

triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifStructure(base_uri: URIRef | None = None, uri: URIRef | None = None, URIScheme: str | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, referenceContext: NifContext | None = None, graph: Graph | None = None)[source]#

Bases: NifString

A NIF Structure

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • referenceContext – the context to which the string refers

triples(objects=None)[source]#

Generates all the triples

class nifigator.nifobjects.NifWord(URIScheme: str | None = None, base_uri: URIRef | None = None, uri: URIRef | None = None, beginIndex: Literal | int | None = None, endIndex: Literal | int | None = None, referenceContext: NifContext | None = None, nifsentence: NifSentence | None = None, lemma: URIRef | str | None = None, pos: list | None = None, morphofeats: list | None = None, dependency: list | None = None, dependencyRelationType: str | None = None, nextWord: str | None = None, previousWord: str | None = None, graph: Graph | None = None)[source]#

Bases: NifStructure

A NIF Word

Parameters:
  • URIScheme – the URIScheme of the object

  • base_uri – the uri from which the uri of the object is derived

  • uri – the uri of the object

  • beginIndex – the start index in the context string

  • endIndex – the end index in the context string

  • referenceContext – the context to which the string refers

  • nifsentence – the sentence of the word

  • lemma – the lemma of the word

  • pos – the part-of-speech tags (a list)

  • morphofeats – the morphological features (a list)

  • dependency – dependency relations of the word (a list)

  • dependencyRelationType – the type of dependency relation of the word

  • nextWord – the next word in the sentence

  • previousWord – the previous word in the sentence

add_dependency(dependency: URIRef | None = None)[source]#

Add a dependency to the list of dependencies of the word

add_morphofeat(morphofeat: URIRef | None = None)[source]#

Add a morphofeat to the list of morphofeats of the word

add_pos(pos: URIRef | None = None)[source]#

Add a pos to the list of pos of the word

property dependency#

Returns the dependencies of the word as a list

property dependencyRelationType#

Returns the dependency relation type of the word

property lemma#

Returns the lemma of the word

property morphofeats#

Returns the morphological features of the word as a list

property nextWord#

Returns the next word of the word in the sentence

property nifsentence#

Returns the sentence to which the word belongs

property pos#

Returns the part-of-speech (pos) of the word

property previousWord#

Returns the previous word of the word in the sentence

set_dependency(dependency: list | None = None)[source]#

Sets the dependency of the word (a list)

set_dependencyRelationType(dependencyRelationType: str | None = None)[source]#

Sets the dependencyRelationType of the word (a string)

set_lemma(lemma: URIRef | str | None = None)[source]#

Sets the lemma of the word (a string)

set_morphofeats(morphofeats: list | None = None)[source]#

Sets the morphological features of the word (a rdflib.URIRef or a list of rdflib.URIRef)

set_nextWord(nextWord: NifWord | None = None)[source]#

Sets the next word of the word in the sentence

set_nifsentence(nifsentence: NifSentence | None = None)[source]#

Sets the sentence of which the word is a part

set_pos(pos: list | None = None)[source]#

Sets the part-of-speech (pos) of the word (a rdflib.URIRef or a list of rdflib.URIRef)

set_previousWord(previousWord: NifWord | None = None)[source]#

Sets the previous word of the word in the sentence

triples(objects=None)[source]#

Generates all the triples

nifigator.nifvecobjects module#

class nifigator.nifvecobjects.NifVectorGraph(nif_graph: NifGraph | None = None, context_uris: list | None = None, documents: list | None = None, base_uri: Namespace = Namespace('https://mangosaurus.eu/rdf-data/nifvec-data/'), lang: str | None = None, params: dict = {}, store: Store | str = 'default', identifier: IdentifiedNode | str | None = None, namespace_manager: NamespaceManager | None = None, base: str | None = None, bind_namespaces: str = 'core')[source]#

Bases: NifGraph

A NIF Vector graph

Parameters:
  • nif_graph (NifGraph) – the graph from which to construct the NIF Vector graph (optional)

  • context_uris (list) – the context uris of the contexts used with the nif_graph to construct the NIF Vector graph (optional)

  • documents (list) – the documents from which to construct the NIF Vector graph (optional)

  • base_uri (Namespace) – the namespace of the nifvec data

  • lang (str) – the language of the nifvec data

  • params (dict) – parameters for constructing the NIF Vector graph

compact()[source]#

This function compacts the NifVector graph by replacing all hasCount triples by one sum hasCount triple

context_phrases(context: tuple | None = None, left: str | None = None, right: str | None = None, topn: int = 15)[source]#

Function that returns the phrases of a context

dict_phrases_contexts(word: str | None = None, topn: int = 7, topcontexts: int = 10)[source]#
extract_rdf_type(rdf_type: str | None = None, topn: int | None = None)[source]#
find_otherForms(phrase: str | None = None, phrase_uri: URIRef | None = None)[source]#
generate_triples(phrases: dict = {}, contexts: dict = {})[source]#

Function to create all triples of a set of documents

Parameters:
  • phrases – dictionary of all phrases to be stored

  • contexts – dictionary of all contexts to be stored

load_vectors(documents: dict | None = None, vectors: dict | None = None, topn: int = 15, includePhraseVectors: bool = True, includeContextVectors: bool = False, includeOtherForms: bool = False)[source]#

Function to retrieve the vectors of phrases and context of a set of documents

most_similar(phrase: str | None = None, phrase_uri: URIRef | None = None, context: str | None = None, context_uri: URIRef | None = None, contexts: list | None = None, contexts_uris: list | None = None, topn: int = 15, topcontexts: int = 25, topphrases: int = 25)[source]#

Function that returns most similar phrases of a phrase

Parameters:
  • phrase – the phrase from which to derive similar phrases (as a string)

  • phrase_uri – the phrase from which to derive similar phrases (as a uri)

  • context – the context to take into account for deriving similar phrases (as a string)

  • context_uri – the context to take into account for deriving similar phrases (as a uri)

  • contexts – use list of contexts to filter

  • contexts_uris – filter contexts

  • topn – restrict output to topn (default = 15)

  • topcontexts – number of similar contexts to use when using phrase or phrase_uri

  • topphrases – number of similar phrases to use when using context or context_uri

phrase_contexts(phrase: str | None = None, phrase_uri: URIRef | None = None, left: str | None = None, right: str | None = None, topn: int = 15)[source]#

Function that returns the contexts of a phrase

Parameters:
  • phrase – the phrase from which to derive the contexts (as a string)

  • phrase_uri – the phrase from which to derive the contexts (as a uri)

  • left – the left side of the context (optional, as a string)

  • right – the right side of the context (optional, as a string)

  • topn – restrict output to topn (default = 15)

phrases(topn: int | None = None)[source]#

Returns phrases with their counts in the graph

store_triples(phrases: dict = {}, contexts: dict = {})[source]#

Function to store the triples from a document set into the NifVector graph.

The triples are loaded in batches into the NifVector graph, to limit the number of SPARQL updates.

Parameters:
  • phrases – dictionary of all phrases to be stored

  • contexts – dictionary of all contexts to be stored

nifigator.nifvecobjects.document_vector(documents: dict | None = None, vectors: dict | None = None, includePhraseVectors: bool = True, includeContextVectors: bool = False, topn: int = 15, merge_dict: bool = False, params: dict | None = None)[source]#

Extracts the phrases of a string and creates a dict of phrases with their contexts

nifigator.nifvecobjects.generate_document_contexts(init_phrases: dict | None = None, documents: dict | None = None, params: dict = {})[source]#
nifigator.nifvecobjects.generate_document_phrases(documents: dict | None = None, params: dict = {})[source]#

This function generates all phrases in the documents

Parameters:
  • documents – a dict with context.uri as keys and context.isString as values

  • params – a dict with parameters

nifigator.nifvecobjects.generate_sentence_phrases(sentences: list | None = None, params: dict = {})[source]#

Generator for all phrases and their location in the sentences

nifigator.nifvecobjects.preprocess(document: str | None = None, params: dict = {})[source]#

nifigator.pdfparser module#

class nifigator.pdfparser.PDFDocument(join_hyphenated_words: bool = True, ignore_control_characters: str = '[\x00-\x08\x0b-\x0c\x0e-\x1f]')[source]#

Bases: object

getstream() bytes[source]#

Function to stream the PDFDocument in xml

Returns:

byte stream of the PDFDocument in xml

open(input: str | bytes)[source]#

Function to open a PDFDocument in xml

Parameters:

input – the location of the PDFDocument in xml to be opened or a bytes object containing the file content

property page_offsets#

Property to extract page offsets from PDFDocument

Returns:

list of PDF_offset elements (named tuples)

parse(file: str | ~_io.BytesIO | None = None, codec: str = 'utf-8', password: str = '', laparams: ~pdfminer.layout.LAParams = <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>)[source]#

Function to convert pdf to xml or text

Parameters:
  • file – location or stream of the file to be converted

  • codec – codec to be used for conversion

  • password – password to be used for conversion

  • laparams – laparams for the pdfminer.six parser

  • join_hyphenated_words – join words split by a hyphen at a line break, e.g. ‘hyphen-\nated wor-\nds’ becomes ‘hyphenated words’

Returns:

property text#

Property to extract text from PDFDocument

Return type:

str

write(output: str) None[source]#

Function to write a PDFDocument in xml

Parameters:

output – the location of the PDFDocument in xml to be stored

nifigator.search module#

nifigator.utils module#

nifigator.utils.align_stanza_dict_offsets(stanza_dict: list | None = None, sentences: list | None = None)[source]#

Function to align the stanza dict offsets with the offsets from the tokenizer

Parameters:
  • stanza_dict – the output dict from the Stanza pipeline

  • sentences – the output of the tokenizer

nifigator.utils.delete_accents(text: str | None = None, lang: str = 'en')[source]#

Function to delete accents from a string

Parameters:
  • text – the string from which the accents should be deleted

  • lang – the language of the text in the string

nifigator.utils.delete_diacritics(text: str | None = None, lang: str = 'en')[source]#

Function to delete diacritics from a string

Parameters:
  • text – the string from which the diacritics should be deleted

  • lang – the language of the text in the string

nifigator.utils.generate_uuid(uri: str | None = None, prefix: str = 'nif-')[source]#

Function to generate the uuid for nif

Parameters:
  • uri – the uri from which the uuid should be generated

  • prefix – the prefix of the uuid, default = “nif-”

nifigator.utils.load_dtd(dtd_url: str) DTD[source]#

Utility function to load the dtd

Parameters:

dtd_url – the location of the dtd file

Returns:

the dtd object to be used for validation

Return type:

etree.DTD

nifigator.utils.natural_sort(elements: list | None = None)[source]#

Function to sort a list of strings with numbers

Parameters:

elements – the list to be sorted

nifigator.utils.prepare_comment_text(text: str) str[source]#

Function to prepare comment text for xml

Parameters:

text – comment to be converted to xml comment

Returns:

converted comment text

Return type:

str

nifigator.utils.replace_escape_characters(text: str | None = None)[source]#

Function to replace escape characters by spaces (maintaining exact character locations)

Parameters:

text – the text where escape characters should be replaced

nifigator.utils.time_in_correct_format(datetime_obj: datetime) str[source]#

Function that returns the current time (UTC)

Parameters:

datetime_obj – the input to be converted

Returns:

the time in correct format

Return type:

str

nifigator.utils.to_iri(s: str = '')[source]#
nifigator.utils.tokenize_text(text: list | None = None, forced_sentence_split_characters: list = [])[source]#
nifigator.utils.tokenizer(text: str | None = None)[source]#

Function to create a list of sentences, each a list of words, with the text, start_char and end_char of each word

Parameters:

text – the text to be tokenized

Module contents#