Source code for nifigator.nifvecobjects

# -*- coding: utf-8 -*-

import logging
from collections import OrderedDict, defaultdict, deque, Counter
from typing import Union, List, Optional
from itertools import combinations, product

import regex as re
from iribaker import to_iri
from rdflib import Graph, Namespace
from rdflib.store import Store
from rdflib.term import IdentifiedNode, URIRef, Literal
from rdflib.plugins.stores import sparqlstore, memory
from rdflib.namespace import NamespaceManager
from .const import (
    STOPWORDS,
    RDF,
    XSD,
    NIF,
    NIFVEC,
    ONTOLEX,
    LEXINFO,
    DECOMP,
    DEFAULT_URI,
    DEFAULT_PREFIX,
    MIN_PHRASE_COUNT,
    MIN_CONTEXT_COUNT,
    MIN_PHRASECONTEXT_COUNT,
    MAX_PHRASE_LENGTH,
    MAX_CONTEXT_LENGTH,
    TRIPLE_BATCH_SIZE,
    CONTEXT_SEPARATOR,
    PHRASE_SEPARATOR,
    WORDS_FILTER,
    FORCED_SENTENCE_SPLIT_CHARACTERS,
    REGEX_FILTER,
)
from .utils import tokenizer, tokenize_text, to_iri
from .nifgraph import NifGraph
from .multisets import merge_multiset

default_min_phrase_count = 2
default_min_phrasecontext_count = 2
default_min_context_count = 2
default_max_context_length = 5
default_max_phrase_length = 5
default_context_separator = "_"
default_phrase_separator = "+"
default_regex_filter = None  # "^[0-9]*[a-zA-Z]*$"
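
# Illustrative sketch (not part of the library): a params dict as consumed by
# NifVectorGraph and the module functions below, using the constant keys
# imported from .const. The values shown are hypothetical; any key that is
# omitted falls back to the module-level defaults above.
#
#   params = {
#       MIN_PHRASE_COUNT: 2,
#       MIN_CONTEXT_COUNT: 2,
#       MIN_PHRASECONTEXT_COUNT: 2,
#       MAX_PHRASE_LENGTH: 5,
#       MAX_CONTEXT_LENGTH: 5,
#       WORDS_FILTER: {"data": STOPWORDS},
#       REGEX_FILTER: "^[0-9]*[a-zA-Z]*$",
#   }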


class NifVectorGraph(NifGraph):
    """
    A NIF Vector graph

    :param nif_graph (NifGraph): the graph from which to construct the NIF Vector graph (optional)

    :param context_uris (list): the context uris of the contexts used with the nif_graph to construct the NIF Vector graph (optional)

    :param documents (list): the documents from which to construct the NIF Vector graph (optional)

    :param base_uri (Namespace): the namespace of the nifvec data

    :param lang (str): the language of the nifvec data

    :param params (dict): parameters for constructing the NIF Vector graph

    """

    def __init__(
        self,
        nif_graph: NifGraph = None,
        context_uris: list = None,
        documents: list = None,
        base_uri: Namespace = Namespace(DEFAULT_URI + "nifvec-data/"),
        lang: str = None,
        params: dict = {},
        store: Union[Store, str] = "default",
        identifier: Optional[Union[IdentifiedNode, str]] = None,
        namespace_manager: Optional[NamespaceManager] = None,
        base: Optional[str] = None,
        bind_namespaces: str = "core",
    ):
        super(NifVectorGraph, self).__init__(
            store=store,
            identifier=identifier,
            namespace_manager=namespace_manager,
            base=base,
            bind_namespaces=bind_namespaces,
        )
        self.params = params
        words_filter = params.get(WORDS_FILTER, None)
        if words_filter is not None:
            # reformulate to dict for efficiency
            self.params[WORDS_FILTER]["data"] = {
                phrase: True for phrase in words_filter["data"]
            }
        else:
            self.params[WORDS_FILTER] = None

        self.base_uri = base_uri
        self.lang = lang

        self.bind("nifvec-data", base_uri)
        self.bind("nifvec", NIFVEC)
        self.bind("nif", NIF)
        self.bind("ontolex", ONTOLEX)
        self.bind("lexinfo", LEXINFO)
        self.bind("decomp", DECOMP)

        if nif_graph is not None:
            # if nif_graph is available then contexts are extracted from this graph
            logging.debug(".. extracting documents from graph")
            documents = dict()
            contexts = nif_graph.contexts
            for context in contexts:
                # if context_uris is None then all contexts are extracted,
                # otherwise only those in the context_uris list
                if context_uris is None or context.uri in context_uris:
                    isString = context.isString
                    if isString is not None:
                        documents[context.uri] = preprocess(isString, self.params)
                    else:
                        logging.warning("No isString found for " + str(context.uri))
        if documents is not None:
            phrases = generate_document_phrases(
                documents=documents, params=self.params
            )
            contexts, phrases = generate_document_contexts(
                init_phrases=phrases, documents=documents, params=self.params
            )
            self.store_triples(
                phrases=phrases,
                contexts=contexts,
            )
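
    # Usage sketch (illustrative only; the document URI and text are hypothetical).
    # When no nif_graph is passed, the documents dict is expected to hold
    # preprocessed documents, i.e. lists of token lists as returned by preprocess().
    #
    #   documents = {
    #       "https://example.org/doc1": preprocess(
    #           "The bank raised the interest rate.", params={}
    #       ),
    #   }
    #   vec_graph = NifVectorGraph(documents=documents)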
    def store_triples(
        self,
        phrases: dict = {},
        contexts: dict = {},
    ):
        """
        Function to store the triples from a document set into the NifVector graph.
        The triples are loaded in batches into the NifVector graph, to limit the
        number of SPARQL updates.

        :param phrases: dictionary of all phrases to be stored

        :param contexts: dictionary of all contexts to be stored

        """
        triple_batch_size = self.params.get(TRIPLE_BATCH_SIZE, 5e6)
        count = 1
        temp_g = Graph()
        for triple in self.generate_triples(phrases=phrases, contexts=contexts):
            temp_g.add(triple)
            if count == triple_batch_size:
                self += temp_g
                count = 1
                temp_g = Graph()
            else:
                count += 1
        self += temp_g
        logging.debug(".. finished storing triples")
    def generate_triples(
        self,
        phrases: dict = {},
        contexts: dict = {},
    ):
        """
        Function to create all triples of a set of documents

        :param phrases: dictionary of all phrases to be stored

        :param contexts: dictionary of all contexts to be stored

        """
        logging.debug(".. collecting triples")
        context_sep = self.params.get(CONTEXT_SEPARATOR, default_context_separator)
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        # to add: nifvec graph definition, which contexts? which language, stopwords
        for phrase, value in phrases.items():
            phrase_uri = URIRef(self.base_uri + to_iri(phrase))
            phrase_value = Literal(phrase.replace(phrase_sep, " "), datatype=XSD.string)
            count = Literal(value, datatype=XSD.nonNegativeInteger)
            yield (phrase_uri, RDF.type, NIF.Phrase)
            yield (phrase_uri, RDF.value, phrase_value)
            yield (phrase_uri, NIFVEC.hasCount, count)
        logging.debug(".... finished triples for phrases")
        for (left_part, right_part), value in contexts.items():
            context_uri = URIRef(
                self.base_uri + to_iri(left_part) + context_sep + to_iri(right_part)
            )
            left_context_value = Literal(
                left_part.replace(phrase_sep, " "), datatype=XSD.string
            )
            right_context_value = Literal(
                right_part.replace(phrase_sep, " "), datatype=XSD.string
            )
            context_count = Literal(
                sum(v for v in value.values()), datatype=XSD.nonNegativeInteger
            )
            yield (context_uri, RDF.type, NIFVEC.Context)
            yield (context_uri, NIFVEC.hasLeftValue, left_context_value)
            yield (context_uri, NIFVEC.hasRightValue, right_context_value)
            yield (context_uri, NIFVEC.hasCount, context_count)
        logging.debug(".... finished triples for contexts")
        for (left_part, right_part), value in contexts.items():
            context_uri = URIRef(
                self.base_uri + to_iri(left_part) + context_sep + to_iri(right_part)
            )
            for phrase, phrase_value in value.items():
                window_uri = URIRef(
                    self.base_uri
                    + to_iri(left_part)
                    + context_sep
                    + to_iri(phrase)
                    + context_sep
                    + to_iri(right_part)
                )
                phrase_uri = URIRef(self.base_uri + to_iri(phrase))
                window_count = Literal(phrase_value, datatype=XSD.nonNegativeInteger)
                yield (window_uri, RDF.type, NIFVEC.Window)
                yield (window_uri, NIFVEC.hasContext, context_uri)
                yield (window_uri, NIFVEC.hasPhrase, phrase_uri)
                yield (window_uri, NIFVEC.hasCount, window_count)
                yield (phrase_uri, NIFVEC.isPhraseOf, window_uri)
                yield (context_uri, NIFVEC.isContextOf, window_uri)
        logging.debug(".... finished triples for windows")
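
    # Illustrative output (hypothetical base URI http://example.org/nifvec-data/):
    # for the phrase "rate" seen once in the context ("interest", "SENTEND"),
    # generate_triples yields, among others:
    #
    #   <.../rate>                    rdf:type           nif:Phrase
    #   <.../interest_SENTEND>        rdf:type           nifvec:Context
    #   <.../interest_rate_SENTEND>   rdf:type           nifvec:Window
    #   <.../interest_rate_SENTEND>   nifvec:hasContext  <.../interest_SENTEND>
    #   <.../interest_rate_SENTEND>   nifvec:hasPhrase   <.../rate>
    #   <.../interest_rate_SENTEND>   nifvec:hasCount    "1"^^xsd:nonNegativeInteger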
    def phrase_contexts(
        self,
        phrase: str = None,
        phrase_uri: URIRef = None,
        left: str = None,
        right: str = None,
        topn: int = 15,
    ):
        """
        Function that returns the contexts of a phrase

        :param phrase: the phrase from which to derive the contexts (as a string)

        :param phrase_uri: the phrase from which to derive the contexts (as a uri)

        :param left: the left side of the context (optional, as a string)

        :param right: the right side of the context (optional, as a string)

        :param topn: restrict output to topn (default = 15)

        """
        context_sep = self.params.get(CONTEXT_SEPARATOR, default_context_separator)
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        if phrase_uri is None:
            phrase_uri = URIRef(self.base_uri + phrase_sep.join(phrase.split(" ")))
        q = """
        SELECT DISTINCT ?value_left ?value_right (sum(?count) as ?n)
        WHERE
        {\n"""
        q += (
            """
            {
                """
            + phrase_uri.n3()
            + """ nifvec:isPhraseOf ?w .
                ?w rdf:type nifvec:Window .
                ?w nifvec:hasContext ?c .
                ?w nifvec:hasCount ?count .
            """
        )
        if left is not None:
            q += '?c nifvec:hasLeftValue "' + Literal(left) + '" .\n'
        q += "?c nifvec:hasLeftValue ?value_left .\n"
        if right is not None:
            q += '?c nifvec:hasRightValue "' + Literal(right) + '" .\n'
        q += "?c nifvec:hasRightValue ?value_right .\n"
        q += """
            }
        }
        GROUP BY ?value_left ?value_right
        ORDER BY DESC(?n)
        """
        if topn is not None:
            q += "LIMIT " + str(topn) + "\n"
        results = Counter(
            {tuple([r[0].value, r[1].value]): r[2].value for r in self.query(q)}
        )
        return results
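
    # Illustrative call (output values are hypothetical): the result is a Counter
    # keyed by (left, right) context tuples with their summed window counts.
    #
    #   vec_graph.phrase_contexts("interest rate", topn=3)
    #   # Counter({("the", "SENTEND"): 12, ("an", "of"): 7, ("SENTSTART", "is"): 5})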
    def most_similar(
        self,
        phrase: str = None,
        phrase_uri: URIRef = None,
        context: str = None,
        context_uri: URIRef = None,
        contexts: list = None,
        contexts_uris: list = None,
        topn: int = 15,
        topcontexts: int = 25,
        topphrases: int = 25,
    ):
        """
        Function that returns most similar phrases of a phrase

        :param phrase: the phrase from which to derive similar phrases (as a string)

        :param phrase_uri: the phrase from which to derive similar phrases (as a uri)

        :param context: the context to take into account for deriving similar phrases (as a string)

        :param context_uri: the context to take into account for deriving similar phrases (as a uri)

        :param contexts: use list of contexts to filter

        :param contexts_uris: filter contexts

        :param topn: restrict output to topn (default = 15)

        :param topcontexts: number of similar contexts to use when using phrase or phrase_uri

        :param topphrases: number of similar phrases to use when using context or context_uri

        """
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        context_sep = self.params.get(CONTEXT_SEPARATOR, default_context_separator)
        if phrase is not None:
            phrase_uri = URIRef(self.base_uri + phrase_sep.join(phrase.split(" ")))
        if context is not None:
            context_uri = URIRef(
                self.base_uri
                + context_sep.join([c.replace(" ", phrase_sep) for c in context])
            )
        if contexts is not None:
            contexts_uris = [
                URIRef(
                    self.base_uri
                    + context_sep.join([c.replace(" ", phrase_sep) for c in context])
                )
                for context in contexts
            ]
        q = """
        SELECT distinct ?v (count(?c) as ?num1)
        WHERE
        {\n"""
        q += """
            {"""
        if phrase_uri is not None:
            q += (
                """
                {
                    SELECT DISTINCT ?c (sum(?count1) as ?n1)
                    WHERE
                    {
                        """
                + phrase_uri.n3()
                + """ nifvec:isPhraseOf ?w1 .
                        ?w1 rdf:type nifvec:Window .
                        ?w1 nifvec:hasContext ?c .
                        ?w1 nifvec:hasCount ?count1 .
                    }
                    GROUP BY ?c
                    ORDER BY DESC(?n1)
                    LIMIT """
                + str(topcontexts)
                + """
                }
                """
            )
        if context_uri is not None:
            q += (
                """
                {
                    SELECT DISTINCT ?p (sum(?count2) as ?n2)
                    WHERE
                    {
                        """
                + context_uri.n3()
                + """ nifvec:isContextOf ?w2 .
                        ?w2 rdf:type nifvec:Window .
                        ?w2 nifvec:hasPhrase ?p .
                        ?w2 nifvec:hasCount ?count2 .
                    }
                    GROUP BY ?p
                    ORDER BY DESC(?n2)
                    LIMIT """
                + str(topphrases)
                + """
                }
                """
            )
        q += """
            ?p nifvec:isPhraseOf ?w .
            ?c nifvec:isContextOf ?w .
            ?w rdf:type nifvec:Window .
            ?p rdf:value ?v .
        """
        if contexts_uris is not None:
            q += "FILTER (?c IN ("
            for contexts_uri in contexts_uris:
                q += contexts_uri.n3()
                if contexts_uri != contexts_uris[-1]:
                    q += ", "
            q += "))"
        q += """
            }
        }
        GROUP BY ?v
        ORDER BY DESC (?num1)
        """
        if topn is not None:
            q += "LIMIT " + str(topn) + "\n"
        results = [item for item in self.query(q)]
        if len(results) > 0:
            norm = results[0][1].value
            results = dict({r[0].value: (r[1].value, norm) for r in results})
        else:
            results = dict()
        return results
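
    # Illustrative call (output values are hypothetical): keys are phrase strings,
    # values are tuples (number of top contexts of the query shared by the phrase,
    # count of the top-ranked phrase), so value[0] / value[1] can serve as a
    # similarity score.
    #
    #   vec_graph.most_similar("interest rate", topn=3, topcontexts=25)
    #   # {"interest rate": (25, 25), "exchange rate": (14, 25), "inflation": (9, 25)}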
    def extract_rdf_type(self, rdf_type: str = None, topn: int = None):
        """
        Function that returns the values of all resources of a given rdf_type
        (nif:Phrase or nifvec:Context) with their counts
        """
        context_sep = self.params.get(CONTEXT_SEPARATOR, default_context_separator)
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        q = """
        SELECT distinct ?v (sum(?count) as ?num)
        WHERE
        {\n"""
        q += (
            """
            {
                ?w rdf:type """
            + rdf_type
            + """ .
                ?w nifvec:hasCount ?count .
                ?w rdf:value ?v .
            }
        }
        GROUP BY ?v
        ORDER BY DESC (?num)
        """
        )
        if topn is not None:
            q += "LIMIT " + str(topn) + "\n"
        results = [item for item in self.query(q)]
        if rdf_type == "nif:Phrase":
            results = {r[0].replace(phrase_sep, " "): r[1].value for r in results}
        elif rdf_type == "nifvec:Context":
            results = {r[0].replace(context_sep, " "): r[1].value for r in results}
        return results
    def phrases(self, topn: int = None):
        """
        Returns phrases with their counts in the graph
        """
        q = """
        SELECT distinct ?v (sum(?count) as ?num)
        WHERE
        {\n"""
        q += """
            {
                ?w rdf:type nif:Phrase .
                ?w nifvec:hasCount ?count .
                ?w rdf:value ?v .
            }
        }
        GROUP BY ?v
        ORDER BY DESC (?num)
        """
        if topn is not None:
            q += "LIMIT " + str(topn) + "\n"
        results = Counter({r[0].value: r[1].value for r in self.query(q)})
        return results
    def dict_phrases_contexts(
        g, word: str = None, topn: int = 7, topcontexts: int = 10
    ):
        """
        Function that returns a dict with the context counts of a word and of
        its most similar phrases (can be used to construct a DataFrame)
        """
        contexts = g.phrase_contexts(word, topn=topcontexts)
        phrases = g.most_similar(word, topn=topn, topcontexts=topcontexts)
        d = {
            "index": phrases.keys(),
            "columns": contexts.keys(),
            "data": [],
            "index_names": ["phrase"],
            "column_names": ["left context phrase", "right context phrase"],
        }
        for phrase in phrases:
            phrase_contexts = g.phrase_contexts(phrase, topn=None)
            d["data"].append([phrase_contexts.get(c, 0) for c in contexts.keys()])
        return d
    def context_phrases(
        self, context: tuple = None, left: str = None, right: str = None, topn: int = 15
    ):
        """
        Function that returns the phrases of a context
        """
        context_sep = self.params.get(CONTEXT_SEPARATOR, default_context_separator)
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        if context is not None:
            context = (
                phrase_sep.join(context[0].split(" ")),
                phrase_sep.join(context[1].split(" ")),
            )
            context_uri = URIRef(self.base_uri + context_sep.join(context)).n3()
        q = """
        SELECT distinct ?v (sum(?s) as ?num)
        WHERE
        {\n"""
        q += """
            {"""
        if context is not None:
            q += context_uri + " nifvec:isContextOf ?window ."
        if left is not None:
            q += '?context nifvec:hasLeftValue "' + Literal(left) + '" . '
        if right is not None:
            q += '?context nifvec:hasRightValue "' + Literal(right) + '" . '
        q += """
                ?context nifvec:isContextOf ?window .
                ?window rdf:type nifvec:Window .
                ?window nifvec:hasCount ?s .
                ?phrase nifvec:isPhraseOf ?window .
                ?phrase rdf:value ?v .
            }
        }
        GROUP BY ?v
        ORDER BY DESC(?num)
        """
        if topn is not None:
            q += "LIMIT " + str(topn) + "\n"
        results = Counter({r[0].value: r[1].value for r in self.query(q)})
        return results
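
    # Illustrative call (output values are hypothetical): the phrases that occur
    # between the given left and right context strings, as a Counter.
    #
    #   vec_graph.context_phrases(context=("the", "of"), topn=3)
    #   # Counter({"value": 31, "number": 18, "amount": 9})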
    def compact(self):
        """
        This function compacts the NifVector graph by replacing all hasCount
        triples of a resource by a single summed hasCount triple
        """
        logging.info("Compacting")
        logging.info(".. stage 1 / 3")
        self.update(
            """
            INSERT
            {
                ?s nifvec:hasTotalCount ?tc
            }
            WHERE
            {
                {
                    SELECT ?s (sum(?c) as ?tc)
                    WHERE
                    {
                        ?s nifvec:hasCount ?c
                    }
                    GROUP BY ?s
                }
            }
            """
        )
        logging.info(".. stage 2 / 3")
        self.update(
            """
            DELETE
            {
                ?s nifvec:hasCount ?c
            }
            WHERE
            {
                ?s nifvec:hasCount ?c
            }
            """
        )
        logging.info(".. stage 3 / 3")
        self.update(
            """
            DELETE
            {
                ?s nifvec:hasTotalCount ?c
            }
            INSERT
            {
                ?s nifvec:hasCount ?c
            }
            WHERE
            {
                ?s nifvec:hasTotalCount ?c
            }
            """
        )
        logging.info(".. finished")
        return None
    def find_otherForms(
        self,
        phrase: str = None,
        phrase_uri: URIRef = None,
    ):
        """
        Function that returns the other written forms of a phrase (via the
        ontolex lexical entries in the graph) with their counts
        """
        phrase_sep = self.params.get(PHRASE_SEPARATOR, default_phrase_separator)
        if phrase_uri is None:
            phrase_uri = URIRef(self.base_uri + phrase_sep.join(phrase.split(" ")))
        q = (
            """
        SELECT distinct ?f (sum(?c) as ?num1)
        WHERE
        {
            """
            + phrase_uri.n3()
            + """ rdf:value ?v .
            {
                ?e ontolex:canonicalForm [ ontolex:writtenRep ?v ] .
            }
            UNION
            {
                ?e ontolex:otherForm [ ontolex:writtenRep ?v ] .
            }
            ?e ontolex:otherForm|ontolex:canonicalForm [ ontolex:writtenRep ?f ] .
            ?p rdf:value ?f .
            ?p nifvec:hasCount ?c .
            ?p rdf:type nif:Phrase .
        }
        GROUP BY ?f
        ORDER BY ?num1
        """
        )
        results = [item for item in self.query(q)]
        if len(results) > 0:
            results = dict({r[0].value: r[1].value for r in results})
        else:
            results = dict()
        return results
    # set up a dictionary with phrases and their contexts to speed up subsequent lookups
    def load_vectors(
        self,
        documents: dict = None,
        vectors: dict = None,
        topn: int = 15,
        includePhraseVectors: bool = True,
        includeContextVectors: bool = False,
        includeOtherForms: bool = False,
    ):
        """
        Function to retrieve the vectors of the phrases and contexts of a set of documents
        """
        if vectors is None:
            vectors = dict()
        params = {
            WORDS_FILTER: {"data": {phrase: True for phrase in STOPWORDS}},
            MIN_PHRASE_COUNT: 1,
        }
        documents = {
            key: preprocess(value, self.params) for key, value in documents.items()
        }
        phrases = generate_document_phrases(documents=documents, params=params)
        if includePhraseVectors:
            for phrase in phrases.keys():
                if vectors.get(phrase, None) is None:
                    if includeOtherForms:
                        vector = Counter()
                        for form in self.find_otherForms(phrase):
                            vector += self.phrase_contexts(form, topn=topn)
                    else:
                        vector = self.phrase_contexts(phrase, topn=topn)
                    vectors[phrase] = vector
        if includeContextVectors:
            # derive the contexts of the documents and retrieve their phrase vectors
            contexts, phrases = generate_document_contexts(
                documents=documents, init_phrases=phrases, params=params
            )
            for context in contexts.keys():
                if vectors.get(context, None) is None:
                    vectors[context] = self.context_phrases(context, topn=topn)
        return vectors
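
    # Illustrative call (document URI and text are hypothetical): the returned
    # dict maps each phrase found in the documents to a Counter of its contexts
    # in the graph, and can be passed as vectors to document_vector below.
    #
    #   docs = {"https://example.org/doc1": "The bank raised the interest rate."}
    #   vectors = vec_graph.load_vectors(documents=docs, topn=15)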
def document_vector(
    documents: dict = None,
    vectors: dict = None,
    includePhraseVectors: bool = True,
    includeContextVectors: bool = False,
    topn: int = 15,
    merge_dict: bool = False,
    params: dict = None,
):
    """
    Extract the phrases of a string and create a dict of phrases with their contexts
    """
    params = {
        WORDS_FILTER: {"data": {phrase: True for phrase in STOPWORDS}},
        MIN_PHRASE_COUNT: 1,
        MIN_CONTEXT_COUNT: 1,
        MIN_PHRASECONTEXT_COUNT: 1,
    }
    phrase_sep = params.get(PHRASE_SEPARATOR, default_phrase_separator)
    documents = {key: preprocess(value, params) for key, value in documents.items()}
    phrases = generate_document_phrases(documents=documents, params=params)
    if includeContextVectors:
        contexts, phrases = generate_document_contexts(
            documents=documents, init_phrases=phrases, params=params
        )
    res = dict()
    if includePhraseVectors:
        for phrase in phrases.keys():
            p = phrase.replace(phrase_sep, " ")
            if p not in vectors.keys():
                logging.debug("Phrase " + repr(p) + " not found in vectors.")
            else:
                res[p] = Counter(
                    {
                        key: value
                        for key, value in vectors.get(p, Counter()).most_common(topn)
                    }
                )
    if includeContextVectors:
        for left, right in contexts.keys():
            c = (left.replace(phrase_sep, " "), right.replace(phrase_sep, " "))
            if c not in vectors.keys():
                logging.debug("Context " + repr(c) + " not found in vectors.")
            else:
                res[c] = Counter(
                    {
                        key: value
                        for key, value in vectors.get(c, Counter()).most_common(topn)
                    }
                )
    if merge_dict:
        res = merge_multiset(res)
    return res
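
# Illustrative sketch (hypothetical document and vectors): combine load_vectors
# and document_vector to obtain, per phrase in a document, its topn contexts.
#
#   docs = {"https://example.org/doc1": "The bank raised the interest rate."}
#   vectors = vec_graph.load_vectors(documents=docs)
#   doc_vec = document_vector(documents=docs, vectors=vectors, topn=15)
#   # {"bank": Counter({...}), "interest rate": Counter({...}), ...}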
def generate_document_contexts(
    init_phrases: dict = None, documents: dict = None, params: dict = {}
):
    """
    This function generates the contexts of the phrases in the documents,
    starting from one-word left and right contexts and iteratively extending
    them as long as more than one phrase shares the context
    """
    logging.debug(".. generate document contexts started")
    max_context_length = params.get(MAX_CONTEXT_LENGTH, default_max_context_length)
    min_context_count = params.get(MIN_CONTEXT_COUNT, default_min_context_count)
    min_phrasecontext_count = params.get(
        MIN_PHRASECONTEXT_COUNT, default_min_phrasecontext_count
    )
    phrase_sep = params.get(PHRASE_SEPARATOR, default_phrase_separator)
    init_contexts = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    for phrase, docs in init_phrases.items():
        for doc, locs in docs.items():
            for sent_idx, begin_idx, end_idx in locs:
                sent = documents[doc][sent_idx]
                if begin_idx - 1 >= 0 and end_idx + 1 <= len(sent):
                    l = sent[begin_idx - 1]
                    r = sent[end_idx]
                    init_contexts[(l, r)][phrase][doc].add(
                        (sent_idx, begin_idx, end_idx)
                    )
    del init_phrases
    to_process_contexts = dict()
    for d_context, d_phrases in init_contexts.items():
        if len(d_phrases.keys()) > 1:
            to_process_contexts[d_context] = (d_phrases, 1, 1)
    # aggregate results into contexts dict
    final_contexts = defaultdict(Counter)
    for d_context, d_phrases in init_contexts.items():
        d_phrase_counter = Counter(
            {
                d_phrase: sum(len(loc) for loc in docs.values())
                for d_phrase, docs in d_phrases.items()
                if sum(len(loc) for loc in docs.values()) >= min_phrasecontext_count
            }
        )
        if (
            len(d_phrase_counter.keys()) > 0
            and sum(v for v in d_phrase_counter.values()) >= min_context_count
        ):
            final_contexts[d_context] = d_phrase_counter
        else:
            if d_context in to_process_contexts.keys():
                del to_process_contexts[d_context]
    del init_contexts
    logging.debug(".... added contexts: " + str(len(to_process_contexts)))
    while to_process_contexts != dict():
        new_contexts = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
        for d_context, (
            d_phrases,
            left_size,
            right_size,
        ) in to_process_contexts.items():
            # print("evaluating " + str(d_context) + ": " + str(left_size) + ", " + str(right_size))
            for phrase, docs in d_phrases.items():
                for doc, locs in docs.items():
                    for sent_idx, begin_idx, end_idx in locs:
                        sent = documents[doc][sent_idx]
                        if d_context == (
                            phrase_sep.join(sent[begin_idx - left_size : begin_idx]),
                            phrase_sep.join(sent[end_idx : end_idx + right_size]),
                        ):
                            # right
                            if (
                                begin_idx - left_size >= 0
                                and end_idx + right_size + 1 <= len(sent)
                            ):
                                l = phrase_sep.join(
                                    sent[begin_idx - left_size : begin_idx]
                                )
                                r = phrase_sep.join(
                                    sent[end_idx : end_idx + right_size + 1]
                                )
                                # print(".. (right) adding " + str((l, r)))
                                new_contexts[(l, r)][phrase][doc].add(
                                    (sent_idx, begin_idx, end_idx)
                                )
                            # left
                            if (
                                begin_idx - left_size - 1 >= 0
                                and end_idx + right_size <= len(sent)
                            ):
                                l = phrase_sep.join(
                                    sent[begin_idx - left_size - 1 : begin_idx]
                                )
                                r = phrase_sep.join(
                                    sent[end_idx : end_idx + right_size]
                                )
                                # print(".. (left) adding " + str((l, r)))
                                new_contexts[(l, r)][phrase][doc].add(
                                    (sent_idx, begin_idx, end_idx)
                                )
        # determine contexts for further processing
        to_process_contexts = dict()
        for (left_part, right_part), d_phrases in new_contexts.items():
            if (
                len(d_phrases.keys()) > 1
                and len(left_part.split(phrase_sep)) < max_context_length
                and len(right_part.split(phrase_sep)) < max_context_length
            ):
                to_process_contexts[(left_part, right_part)] = (
                    d_phrases,
                    len(left_part.split(phrase_sep)),
                    len(right_part.split(phrase_sep)),
                )
        # add new contexts to contexts
        for d_context, d_phrases in new_contexts.items():
            d_phrase_counter = Counter(
                {
                    d_phrase: sum(len(loc) for loc in docs.values())
                    for d_phrase, docs in d_phrases.items()
                    if sum(len(loc) for loc in docs.values())
                    >= min_phrasecontext_count
                }
            )
            if (
                len(d_phrase_counter.keys()) > 0
                and sum(v for v in d_phrase_counter.values()) >= min_context_count
            ):
                final_contexts[d_context] = d_phrase_counter
            else:
                if d_context in to_process_contexts.keys():
                    del to_process_contexts[d_context]
        logging.debug(".... added contexts: " + str(len(to_process_contexts)))
    # create final phrases dict from contexts
    phrases = Counter()
    for d_context, d_phrases in final_contexts.items():
        for phrase, value in d_phrases.items():
            phrases[phrase] += value
    logging.debug(".. generate document contexts finished")
    logging.debug(".... total contexts: " + str(len(final_contexts.keys())))
    logging.debug(".... total phrases: " + str(len(phrases.keys())))
    return final_contexts, phrases
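
# Illustrative result shape (counts are hypothetical): final_contexts maps
# (left, right) context tuples to Counters of phrases; phrases maps each phrase
# to its total count over all retained contexts.
#
#   contexts, phrases = generate_document_contexts(
#       init_phrases=phrases, documents=documents, params=params
#   )
#   # contexts[("the", "of")]  -> Counter({"value": 3, "number": 2})
#   # phrases["value"]         -> total count of "value" over all contexts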
def generate_document_phrases(documents: dict = None, params: dict = {}):
    """
    This function generates all phrases in the documents

    :param documents: a dict with context.uri as keys and context.isString as values

    :param params: a dict with parameters

    """
    logging.debug(".. generating document phrases")
    min_phrase_count = params.get(MIN_PHRASE_COUNT, default_min_phrase_count)
    # create a dict for each phrase that contains the phrase locations
    phrases = defaultdict(lambda: defaultdict(set))
    for context_uri, context_isString in documents.items():
        for phrase, loc in generate_sentence_phrases(context_isString, params=params):
            phrases[phrase][context_uri].add(loc)
    # delete all phrases that occur fewer times than min_phrase_count
    to_delete = set()
    for phrase, docs in phrases.items():
        if sum(len(loc) for loc in docs.values()) < min_phrase_count:
            to_delete.add(phrase)
    for phrase in to_delete:
        del phrases[phrase]
    logging.debug(".... found phrases: " + str(len(phrases.keys())))
    return phrases
def generate_sentence_phrases(
    sentences: list = None,
    params: dict = {},
):
    """
    Generator for all phrases and their location in the sentences
    """
    phrase_sep = params.get(PHRASE_SEPARATOR, default_phrase_separator)
    words_filter = params.get(WORDS_FILTER, None)
    max_phrase_length = params.get(MAX_PHRASE_LENGTH, default_max_phrase_length)
    for sent_idx, sentence in enumerate(sentences):
        for word_idx, word in enumerate(sentence):
            for phrase_length in range(1, max_phrase_length + 1):
                if word_idx + phrase_length <= len(sentence):
                    phrase_list = [
                        sentence[word_idx + i] for i in range(0, phrase_length)
                    ]
                    phrase = phrase_sep.join(word for word in phrase_list)
                    if words_filter is None:
                        yield (
                            phrase,
                            (sent_idx, word_idx, word_idx + phrase_length),
                        )
                    else:
                        # phrases may not start or end with one of the stopwords
                        phrase_stop_words = [
                            words_filter["data"].get(word.lower(), False)
                            for word in [phrase_list[0], phrase_list[-1]]
                        ]
                        if not any(phrase_stop_words):
                            yield (
                                phrase,
                                (sent_idx, word_idx, word_idx + phrase_length),
                            )
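
# Illustrative example (hypothetical sentence): with MAX_PHRASE_LENGTH set to 2
# and no words filter, the generator yields every 1- and 2-gram together with
# its (sentence index, begin index, end index) location.
#
#   sentences = [["SENTSTART", "interest", "rate", "SENTEND"]]
#   list(generate_sentence_phrases(sentences, params={MAX_PHRASE_LENGTH: 2}))
#   # [("SENTSTART", (0, 0, 1)), ("SENTSTART+interest", (0, 0, 2)),
#   #  ("interest", (0, 1, 2)), ("interest+rate", (0, 1, 3)),
#   #  ("rate", (0, 2, 3)), ("rate+SENTEND", (0, 2, 4)),
#   #  ("SENTEND", (0, 3, 4))]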
def preprocess(
    document: str = None,
    params: dict = {},
):
    """
    Function that tokenizes a document into sentences of words and adds
    SENTSTART and SENTEND tokens to each sentence
    """
    split_characters = params.get(FORCED_SENTENCE_SPLIT_CHARACTERS, [])
    regex_filter = params.get(REGEX_FILTER, default_regex_filter)
    # tokenize the document into sentences
    sentences = [
        [word["text"] for word in sentence]
        for sentence in tokenize_text(document, split_characters)
    ]
    if regex_filter is not None:
        # select tokens given a regex filter and add the start and end of sentence
        # tokens SENTSTART and SENTEND
        preprocessed = [
            ["SENTSTART"]
            + [word for word in sentence if re.match(regex_filter, word)]
            + ["SENTEND"]
            for sentence in sentences
        ]
    else:
        preprocessed = [
            ["SENTSTART"] + sentence + ["SENTEND"] for sentence in sentences
        ]
    return preprocessed
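
# Illustrative example (assuming the tokenizer splits on whitespace and
# punctuation; the exact token boundaries depend on tokenize_text): preprocess
# returns one token list per sentence, wrapped in SENTSTART / SENTEND markers.
#
#   preprocess("The bank raised the interest rate.")
#   # [["SENTSTART", "The", "bank", "raised", "the", "interest", "rate", ".", "SENTEND"]]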