Using a SPARQL endpoint#

Connecting to a (local) SPARQL endpoint#

You can use an existing SPARQL endpoint in the following way.

from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as default

# Connect to triplestore.
store = SPARQLUpdateStore()
query_endpoint = 'http://localhost:3030/nifigator/sparql'
update_endpoint = 'http://localhost:3030/nifigator/update'
store.open((query_endpoint, update_endpoint))
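
If you only need to run queries and no updates, you can instead use rdflib's read-only SPARQLStore. A minimal sketch, assuming the same Fuseki query endpoint as above:

from rdflib.plugins.stores.sparqlstore import SPARQLStore

# a read-only store only needs the query endpoint
readonly_store = SPARQLStore(query_endpoint)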

Then open a NifGraph in the same way as an rdflib.Graph.

from nifigator import NifGraph

# Open a graph in the open store and set identifier to default graph ID.
graph = NifGraph(store=store, identifier=default)

We can then check the number of triples in the store

# count the number of triples in the store
print("Number of triples: "+str(len(graph)))
Number of triples: 1081392

To check the contexts in the graph you can use the catalog property. This property returns a Pandas DataFrame with the context uris in the index, together with the collection uri and the available metadata (from DC and DCTERMS).

# get the catalog with all contexts within the graph
catalog = graph.catalog
catalog
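
The catalog is a regular DataFrame, so you can inspect it with the usual Pandas operations. A small sketch (the exact columns depend on the metadata available in the store):

# the context uris are in the index of the DataFrame
print(catalog.index.tolist())

# the number of contexts in the store
print("Number of contexts: "+str(len(catalog)))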

It is sometimes necessary to compact the database. You can do that with the following command

curl -XPOST http://localhost:3030/$/compact/nifigator
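
The same request can also be sent from Python, for example with the requests package. A small sketch, assuming the dataset is named nifigator as above:

import requests

# ask the Fuseki server to compact the nifigator dataset
response = requests.post("http://localhost:3030/$/compact/nifigator")
print(response.status_code)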

Running SPARQL queries#
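
The queries below use the rdf, nif, olia and xsd prefixes without PREFIX declarations; they are assumed to be bound on the graph (rdflib binds rdf and xsd by default, and NifGraph is assumed to bind the NIF and OLiA namespaces). If a prefix is missing in your setup, you can bind it explicitly. A minimal sketch, where the namespace uris are assumptions to be checked against your data:

from rdflib import Namespace

# bind the prefixes used in the queries below (uris are assumptions)
graph.bind("nif", Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"))
graph.bind("olia", Namespace("http://purl.org/olia/olia.owl#"))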

The total number of words in the collection#

# define the query for the total number of words
q = """
SELECT (count(?s) as ?num) WHERE {
    SERVICE <http://localhost:3030/nifigator/sparql> {
          ?s rdf:type nif:Word . 
    }
}
"""

# execute the query
results = graph.query(q)

# print the results
for result in results:
    print(result[0].value)

This returns

68070
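
The same count can also be obtained through the regular rdflib interface instead of SPARQL. A minimal sketch, where the NIF namespace uri is an assumption; note that this retrieves the matching triples from the endpoint instead of counting them server-side:

from rdflib import RDF, Namespace

NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")

# count all resources typed as nif:Word
print(sum(1 for _ in graph.subjects(RDF.type, NIF.Word)))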

The frequency of words in the collection#

# query for the ten most frequent words in the collection
q = """
SELECT ?w (count(?w) as ?num) WHERE {
    SERVICE <http://localhost:3030/nifigator/sparql> {
        ?s rdf:type nif:Word . 
        ?s nif:anchorOf ?w .
        ?s nif:referenceContext ?c .
    }
}
GROUP BY ?w
ORDER BY DESC(?num)
LIMIT 10
"""

# execute the query
results = graph.query(q)

# print the results
for result in results:
    print((result[0].value, result[1].value))

This returns

('the', 3713)
('.', 2281)
(',', 2077)
('of', 1877)
('and', 1736)
('to', 1420)
('in', 1411)
(')', 892)
('-', 874)
('(', 865)

Adjective-noun combinations in the collection#

# query for the first 10 ADJ-NOUN combinations
q = """
SELECT ?a1 ?a WHERE {
    SERVICE <http://localhost:3030/nifigator/sparql> {
        ?s rdf:type nif:Word . 
        ?s nif:pos olia:CommonNoun .
        ?s nif:anchorOf ?a .
        ?s nif:previousWord [ 
            nif:pos olia:Adjective ;
            nif:anchorOf ?a1
        ]
    }
}
LIMIT 10
"""

# execute the query
results = graph.query(q)

# print the results
for result in results:
    print((result[0].value, result[1].value))

This returns

('Annual', 'Report')
('supervisory', 'authorities')
('financial', 'crime')
('illegal', 'use')
('non-commercial', 'purposes')
('wide', 'availability')
('terrorist', 'financing')
('regular', 'supervision')
('pre-pandemic', 'levels')
('new', 'market')

The following query returns all distinct two-word phrases that start with an adjective and end with a word whose lemma is 'insurer'.

# all two-word phrases starting with an adjective and ending with a word with lemma 'insurer'
q = """
SELECT distinct ?c ?a WHERE {
    SERVICE <http://localhost:3030/nifigator/sparql> {
        ?s rdf:type nif:Word . 
        ?s nif:lemma "insurer"^^xsd:string .
        ?s nif:anchorOf ?a .
        ?s nif:previousWord [ 
            nif:pos olia:Adjective ;
            nif:anchorOf ?c ;
        ]
    }
}
"""

# execute the query
results = graph.query(q)

# print the results
for result in results:
    print((result[0].value, result[1].value))

This gives:

('eligible', 'insurers')
('Non-life', 'insurers')
('Dutch', 'insurers')
('relevant', 'insurers')
('non-life', 'insurers')
('individual', 'insurers')
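
The results contain case variants like ('Non-life', 'insurers') and ('non-life', 'insurers'). If you want to treat these as the same phrase, you can normalize the casing afterwards. A small sketch:

# merge case variants by lowercasing both words of each phrase
phrases = {(adj.value.lower(), noun.value.lower()) for adj, noun in results}
for phrase in sorted(phrases):
    print(phrase)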