Using Ontolex-Lemon data#

Using Ontolex-Lemon with NIF data#

Nifigator includes functionality to work with the Lexicon Model for Ontologies (lemon), developed by the Ontology Lexicon community group (OntoLex).

We will show how to create a lexicon from NIF data and how to use an existing Ontolex-Lemon termbase to search in NIF data.

Open a graph with NIF data#

We read the NIF data that we created earlier.

from nifigator import NifGraph, generate_uuid

original_uri = "https://www.dnb.nl/media/4kobi4vf/dnb-annual-report-2021.pdf"

# the UUID derived from the original URI is used both in the RDF uri and in
# the name of the Turtle file on disk, so compute it only once
document_uuid = generate_uuid(uri=original_uri)
uri = "https://dnb.nl/rdf-data/"+document_uuid

# load the NIF data that was stored earlier (Turtle format) into a NifGraph
nif_graph = NifGraph().parse(
    "..//data//"+document_uuid+".ttl", format="turtle"
)

Then we create a lexicon from the NIF data. First we extract all words with their lemma and part-of-speech tags.

lexicon = nif_graph.lexicon

The lexicon object now contains one lexicon for each language used in the NIF data. In our case we only have an English lexicon.

lexicon['en']
(ontolex:Lexicon) uri = <https://mangosaurus.eu/rdf-data/lexicon/en>
  language : en
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/DNB>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/in>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/the>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/of>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/money>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/,>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/and>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/other>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/financial>
  entry : <https://mangosaurus.eu/rdf-data/lexicon/en/for>
  entry : ...

The lemon lexicon consists of lexical entries that can be retrieved with

From this a lexicon graph can be made.

from nifigator import LemonGraph

# build an Ontolex-Lemon graph from the extracted lexicon and report its size
lexicon_graph = LemonGraph(lexicon=lexicon)
print(f"Number of triples: {len(lexicon_graph)}")

This shows:

Number of triples: 34298
# write the lexicon graph to a Turtle file next to the NIF data
import os

lexicon_filename = generate_uuid(uri=original_uri)+"_lexicon.ttl"
file = os.path.join("..//data//", lexicon_filename)
lexicon_graph.serialize(file, format="turtle")

Using an existing Ontolex-Lemon termbase#

Below, we will give some examples based on the Solvency 2 termbase constructed from the Solvency 2 XBRL taxonomy.

Open the Ontolex-Lemon termbase and add to graph

from rdflib import Graph

TAXO_NAME = "EIOPA_SolvencyII_XBRL_Taxonomy_2.6.0_PWD_with_External_Files"

# location of the termbase in Turtle format
termbase_path = "P://projects//rdf-data//termbases//"+TAXO_NAME+".ttl"

# read the termbase into a plain rdflib Graph
termbase = Graph().parse(termbase_path, format="turtle")

The termbase can be combined with the nif data and we bind the prefixes to the nif graph.

# add all triples of the termbase to the NIF graph
nif_graph += termbase

# register the prefixes that the SPARQL queries rely on
from rdflib import Namespace, namespace

for prefix, ns in (
    ("tbx", Namespace("http://tbx2rdf.lider-project.eu/tbx#")),
    ("ontolex", Namespace("http://www.w3.org/ns/lemon/ontolex#")),
    ("lexinfo", Namespace("http://www.lexinfo.net/ontology/3.0/lexinfo#")),
    ("decomp", Namespace("http://www.w3.org/ns/lemon/decomp#")),
    ("skos", namespace.SKOS),
):
    nif_graph.bind(prefix, ns)

Running SPARQL queries#

Some examples of SPARQL queries:

# All altLabels of the concept Risk margin

q = """
SELECT ?altlabel
WHERE {
    ?concept skos:prefLabel "Risk margin"@en .
    ?concept skos:altLabel ?altlabel .
}
"""
# run the query and materialise the result rows
results = list(nif_graph.query(q))

print(f"Number of hits: {len(results)}")

# show the first five rows
for row in results[:5]:
    print(row)

This returns all locations in the reporting framework that have the label ‘Risk margin’.

Number of hits: 59
(rdflib.term.Literal('SE.02.01.18.01,R0550', lang='en'),)
(rdflib.term.Literal('SR.02.01.07.01,R0640', lang='en'),)
(rdflib.term.Literal('S.02.01.08.01,R0720', lang='en'),)
(rdflib.term.Literal('S.02.01.08.01,R0590', lang='en'),)
(rdflib.term.Literal('SE.02.01.16.01,R0720', lang='en'),)

Now we combine the termbase data and the nif data. The termbase contains the labels and the template codes of all rows and columns. Given a specific datapoint we can look for the prefLabel (the textual representation of the row or column) and look for the lexical entry of that concept in the nif data.

# all occurrences of concepts that have altLabel "S.26.01.01.01,C0030"

q = """
SELECT ?r ?word ?concept
WHERE {
    ?word nif:lemma ?t .
    ?entry ontolex:canonicalForm [ rdfs:label ?t ; ontolex:writtenRep ?r] .
    ?entry ontolex:sense [ ontolex:reference ?concept ] .
    ?concept skos:altLabel "S.26.01.01.01,C0030"@en .
}
"""
# run the query and materialise the result rows
results = list(nif_graph.query(q))

print(f"Number of hits: {len(results)}")

# show the first five rows: written representation, then (word, concept)
for row in results[:5]:
    print((row[0].value, row[1:]))

This returns:

Number of hits: 89
('liability', (rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_266314_266325'), rdflib.term.URIRef('http://eiopa.europa.eu/xbrl/s2md/fws/solvency/solvency2/2021-07-15/tab/s.26.01.01.01#s2md_c5730')))
('liability', (rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_272070_272079'), rdflib.term.URIRef('http://eiopa.europa.eu/xbrl/s2md/fws/solvency/solvency2/2021-07-15/tab/s.26.01.01.01#s2md_c5730')))
('liability', (rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_273065_273074'), rdflib.term.URIRef('http://eiopa.europa.eu/xbrl/s2md/fws/solvency/solvency2/2021-07-15/tab/s.26.01.01.01#s2md_c5730')))
('liability', (rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_276241_276252'), rdflib.term.URIRef('http://eiopa.europa.eu/xbrl/s2md/fws/solvency/solvency2/2021-07-15/tab/s.26.01.01.01#s2md_c5730')))
('liability', (rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_289288_289297'), rdflib.term.URIRef('http://eiopa.europa.eu/xbrl/s2md/fws/solvency/solvency2/2021-07-15/tab/s.26.01.01.01#s2md_c5730')))

If we want to include the page numbers of the hits, we can use the following query.

# all occurrences of concepts that have altLabel "S.26.01.01.01,C0030"
# including the pagenumber

q = """
SELECT ?r ?word ?pagenumber ?concept
WHERE {
    ?word nif:lemma ?t .
    ?entry ontolex:canonicalForm [ rdfs:label ?t ; ontolex:writtenRep ?r] .
    ?entry ontolex:sense [ ontolex:reference ?concept ] .
    ?concept skos:altLabel "S.26.01.01.01,C0030"@en .

    ?word nif:beginIndex ?word_beginIndex .
    ?word nif:endIndex ?word_endIndex .
    ?page rdf:type nif:Page .
    ?page nif:pageNumber ?pagenumber .
    ?page nif:beginIndex ?page_beginIndex .
    FILTER( ?page_beginIndex <= ?word_beginIndex ) .
    ?page nif:endIndex ?page_endIndex .
    FILTER( ?page_endIndex >= ?word_endIndex ) .
}
"""
# run the query; the result object is kept as returned by the graph
results = nif_graph.query(q)

print(f"Number of hits: {len(results)}")

# show the first ten rows: written representation, word uri, page number
for row in list(results)[:10]:
    print((row[0].value, row[1], row[2].value))

This gives:

Number of hits: 89
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_161209_161220'), 66)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_160848_160859'), 66)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_168715_168726'), 69)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_261149_261160'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260373_260384'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260676_260687'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260742_260753'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260865_260876'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260925_260936'), 114)
('liability', rdflib.term.URIRef('https://dnb.nl/rdf-data/nif-5282967702ae37d486ad338b9771ca8f&nif=word_260993_261004'), 114)

Now we check for all concepts in the termbase in the text:

# All concepts in the text

q = """
SELECT distinct ?concept
WHERE {
    ?word nif:lemma ?t .
    ?entry ontolex:canonicalForm [ rdfs:label ?t ; ontolex:writtenRep ?r] .
    ?entry ontolex:sense [ ontolex:reference ?concept ] .
}
"""
# run the query and materialise the result rows
results = list(nif_graph.query(q))

print(f"Number of hits: {len(results)}")

This returns:

Number of hits: 1259

Sometimes terms consist of multiple words:

# All occurrences of 'dutch financial institution'

def find_term(s: str="") -> str:
    """Build a SPARQL query that finds a (multi-word) term in NIF data.

    The term is split into words; the query matches a ``nif:Word`` whose
    ``nif:lemma`` equals the first word and whose successive ``nif:nextWord``
    neighbours have lemmas equal to the following words. It selects the
    ``nif:beginIndex`` of the first word (``?s``) and the ``nif:endIndex`` of
    the last word (``?e``), ordered by ``?s``.

    The query contains no PREFIX declarations, so the ``nif`` and ``xsd``
    prefixes have to be known to the graph that executes it.

    :param s: the term to look for, given as whitespace-separated lemmas
    :return: the SPARQL query as a string
    """
    # Split on any run of whitespace so that repeated, leading or trailing
    # spaces do not produce empty lemmas. A blank term still yields a single
    # empty lemma, as it did before.
    words = s.split() or [""]

    query_lines = [
        "SELECT ?s ?e",
        "WHERE {",
        "    ?w a nif:Word . ",
        "    ?w nif:beginIndex ?s . ",
    ]
    for idx, word in enumerate(words):
        # escape backslashes and double quotes so that the word stays a
        # well-formed SPARQL string literal
        escaped = word.replace("\\", "\\\\").replace('"', '\\"')
        # the idx-th word is reached by following nif:nextWord idx times
        path = "nif:nextWord/" * idx
        query_lines.append(
            '    ?w ' + path + 'nif:lemma "' + escaped + '"^^xsd:string .'
        )
    # the end index is taken from the last word of the term
    last_word_path = "nif:nextWord/" * (len(words) - 1)
    query_lines.append("    ?w " + last_word_path + "nif:endIndex ?e .")
    query_lines.append("}order by ?s")
    return "\n".join(query_lines)

# run the generated query and materialise the result rows
results = list(nif_graph.query(find_term("dutch financial institution")))
print(f"Number of hits: {len(results)}\n")

# print begin and end index of every hit
for row in results:
    print(f"{row[0].value}:{row[1].value}")
Number of hits: 8

40579:40607
47488:47516
58193:58221
115913:115941
116187:116217
116925:116953
203374:203403
322642:322669