Using a SPARQL endpoint#
Connecting to a (local) SPARQL endpoint#
You can use an existing SPARQL endpoint in the following way.
from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as default
# Connect to triplestore.
store = SPARQLUpdateStore()
query_endpoint = 'http://localhost:3030/nifigator/sparql'
update_endpoint = 'http://localhost:3030/nifigator/update'
store.open((query_endpoint, update_endpoint))
Then open a NifGraph in the same way as an rdflib.Graph:
from nifigator import NifGraph
# Open a graph in the open store and set identifier to default graph ID.
graph = NifGraph(store=store, identifier=default)
We can then check the number of triples in the store
# count the number of triples in the store
print("Number of triples: "+str(len(graph)))
Number of triples: 1081392
To check the contexts in the graph you can use the catalog property. This property returns a pandas DataFrame with the context URIs in the index, together with the collection URI and the available metadata (from DC and DCTERMS).
# get the catalog with all contexts within the graph
catalog = graph.catalog
catalog
It is sometimes necessary to compact the database. You can do that with the following command:
curl -XPOST http://localhost:3030/$/compact/nifigator
Running SPARQL queries#
The total number of words in the collection#
# define the query for the total number of words
q = """
SELECT (count(?s) as ?num) WHERE {
SERVICE <http://localhost:3030/nifigator/sparql> {
?s rdf:type nif:Word .
}
}
"""
# execute the query
results = graph.query(q)
# print the results
for result in results:
print(result[0].value)
This returns
68070
The frequency of words per context#
# query for the frequency of words per context
q = """
SELECT ?w (count(?w) as ?num) WHERE {
SERVICE <http://localhost:3030/nifigator/sparql> {
?s rdf:type nif:Word .
?s nif:anchorOf ?w .
?s nif:referenceContext ?c .
}
}
GROUP BY ?w
ORDER BY DESC(?num)
LIMIT 10
"""
# execute the query
results = graph.query(q)
# print the results
for result in results:
print((result[0].value, result[1].value))
This returns
('the', 3713)
('.', 2281)
(',', 2077)
('of', 1877)
('and', 1736)
('to', 1420)
('in', 1411)
(')', 892)
('-', 874)
('(', 865)
Adjective-noun combinations in the context#
# query for the first 10 ADJ-NOUN combinations
q = """
SELECT ?a1 ?a WHERE {
SERVICE <http://localhost:3030/nifigator/sparql> {
?s rdf:type nif:Word .
?s nif:pos olia:CommonNoun .
?s nif:anchorOf ?a .
?s nif:previousWord [
nif:pos olia:Adjective ;
nif:anchorOf ?a1
]
}
}
LIMIT 10
"""
# execute the query
results = graph.query(q)
# print the results
for result in results:
print((result[0].value, result[1].value))
This returns
('Annual', 'Report')
('supervisory', 'authorities')
('financial', 'crime')
('illegal', 'use')
('non-commercial', 'purposes')
('wide', 'availability')
('terrorist', 'financing')
('regular', 'supervision')
('pre-pandemic', 'levels')
('new', 'market')
# All two-word phrases ending with the lemma 'insurer' and starting with an adjective
q = """
SELECT distinct ?c ?a WHERE {
SERVICE <http://localhost:3030/nifigator/sparql> {
?s rdf:type nif:Word .
?s nif:lemma "insurer"^^xsd:string .
?s nif:anchorOf ?a .
?s nif:previousWord [
nif:pos olia:Adjective ;
nif:anchorOf ?c ;
]
}
}
"""
# execute the query
results = graph.query(q)
# print the results
for result in results:
print((result[0].value, result[1].value))
This gives:
('eligible', 'insurers')
('Non-life', 'insurers')
('Dutch', 'insurers')
('relevant', 'insurers')
('non-life', 'insurers')
('individual', 'insurers')