Verifier
The verifier component executes the final verification step in the SilkMoth pipeline. During verification SilkMoth performs the maximum matching between every candidate set and the reference set R. The sets whose maximum matching score surpass the relatedness threshold δ are the verified related sets to R.
For maximum matching computation we treat every element of the two sets as vertices of a bipartite graph and the weights of each edge determined by the similarity function. The maximum weighted matching is computed using the existing library SciPy.
Optionally, a triangle inequality-based reduction can be applied to further improve performance.
Examples
>>> from silkmoth.inverted_index import InvertedIndex
>>> from silkmoth.utils import similar, jaccard_similarity
>>> from silkmoth.verifier import Verifier
>>> S1 = [{"Apple", "Pear", "Car"}, {"Apple", "Sun", "Cat"}]
>>> S2 = [{"Apple", "Berlin", "Sun"}, {"Apple"}]
>>> S = [S1, S2]
>>> I = InvertedIndex(S)
>>> R = [{"Apple"}, {"Berlin", "Sun"}]
>>> verifier = Verifier(0.1, similar, jaccard_similarity)
>>> verifier.get_related_sets(R, {0, 1}, I)
[(0, 0.17073170731707313), (1, 0.7142857142857142)]
>>> verifier = Verifier(0.7, similar, jaccard_similarity)
>>> verifier.get_related_sets(R, {0, 1}, I)
[(1, 0.7142857142857142)]
Source code in src/silkmoth/verifier.py
class Verifier:
"""
The verifier component executes the final verification step in the SilkMoth
pipeline. During verification SilkMoth performs the maximum matching between
every candidate set and the reference set R. The sets whose maximum matching
score surpass the relatedness threshold δ are the verified related sets to R.
For maximum matching computation we treat every element of the two sets as
vertices of a bipartite graph and the weights of each edge determined by the
similarity function. The maximum weighted matching is computed using the existing
library [SciPy](https://scipy.org/).
Optionally, a triangle inequality-based reduction can be applied to further
improve performance.
Examples
--------
```
>>> from silkmoth.inverted_index import InvertedIndex
>>> from silkmoth.utils import similar, jaccard_similarity
>>> from silkmoth.verifier import Verifier
>>> S1 = [{"Apple", "Pear", "Car"}, {"Apple", "Sun", "Cat"}]
>>> S2 = [{"Apple", "Berlin", "Sun"}, {"Apple"}]
>>> S = [S1, S2]
>>> I = InvertedIndex(S)
>>> R = [{"Apple"}, {"Berlin", "Sun"}]
>>> verifier = Verifier(0.1, similar, jaccard_similarity)
>>> verifier.get_related_sets(R, {0, 1}, I)
[(0, 0.17073170731707313), (1, 0.7142857142857142)]
>>> verifier = Verifier(0.7, similar, jaccard_similarity)
>>> verifier.get_related_sets(R, {0, 1}, I)
[(1, 0.7142857142857142)]
```
"""
def __init__(self, related_thresh, sim_metric, sim_func, sim_thresh=0, reduction=False):
"""
Initialize the verifier with some parameters.
Args:
related_thresh (float): Relatedness threshold delta
sim_metric (callable): Similarity metric similar(...)/contain(...)
sim_func (callable): Similarity function phi
sim_thresh (float): Similarity threshold alpha
reduction (bool): Flag to activate/deactivate triangle inequality reduction
"""
self.related_thresh = related_thresh
self.sim_metric = sim_metric
self.sim_func = sim_func
self.sim_thresh = sim_thresh
self.reduction = reduction
def get_mm_score(self, reference_set, source_set) -> float:
"""
Helper function that computes the maximum weighted bipartite matching score,
where elements correspond to nodes and the edges are weighted using the similarity
function.
Args:
reference_set (list): Tokenized reference set R
source_set (list): Tokenized source set S
Returns:
float: Maximum matching score (sum of weights of edges in the
matching)
"""
n, m = len(reference_set), len(source_set)
if n == 0 or m == 0:
return 0.0
weights = np.zeros((n, m), dtype=float)
for i, r_elem in enumerate(reference_set):
for j, s_elem in enumerate(source_set):
weights[i, j] = self.sim_func(r_elem, s_elem, self.sim_thresh)
# use negative weights to search for minimal cost
cost = -weights
row_ind, col_ind = linear_sum_assignment(cost)
return float(weights[row_ind, col_ind].sum())
def get_relatedness(self, reference_set, source_set) -> float:
"""
Helper function that gives the relatedness score by computing the maximum weighted
bipartite matching.
Args:
reference_set (list): Tokenized reference set R
source_set (list): Tokenized source set S
Returns:
float: Relatedness score of R and S
"""
r_size = len(reference_set)
s_size = len(source_set)
exact_matches = 0
if self.reduction:
reference_set, source_set, exact_matches = reduce_sets(reference_set, source_set)
mm_score = self.get_mm_score(reference_set, source_set) + exact_matches
relatedness = self.sim_metric(r_size, s_size, mm_score)
return relatedness
def get_related_sets(self, reference_set: list, candidates: set, inverted_index: InvertedIndex) -> list:
"""
Gives all candidate sets that are related to the reference set.
Args:
reference_set (list): Tokeinized reference set R
candidates (set): Collection of indices of candidate sets
inverted_index (InvertedIndex): Inverted index instance
Returns:
list: Pairs of indices of all related sets from the candidates and
their relatedness with the reference set.
"""
related_sets = []
for c in candidates:
source_set = inverted_index.get_set(c)
relatedness = self.get_relatedness(reference_set, source_set)
if relatedness >= self.related_thresh:
related_sets.append((c, relatedness))
return related_sets
__init__(related_thresh, sim_metric, sim_func, sim_thresh=0, reduction=False)
Initialize the verifier with some parameters.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
related_thresh
|
float
|
Relatedness threshold delta |
required |
sim_metric
|
callable
|
Similarity metric similar(...)/contain(...) |
required |
sim_func
|
callable
|
Similarity function phi |
required |
sim_thresh
|
float
|
Similarity threshold alpha |
0
|
reduction
|
bool
|
Flag to activate/deactivate triangle inequality reduction |
False
|
Source code in src/silkmoth/verifier.py
def __init__(self, related_thresh, sim_metric, sim_func, sim_thresh=0, reduction=False):
"""
Initialize the verifier with some parameters.
Args:
related_thresh (float): Relatedness threshold delta
sim_metric (callable): Similarity metric similar(...)/contain(...)
sim_func (callable): Similarity function phi
sim_thresh (float): Similarity threshold alpha
reduction (bool): Flag to activate/deactivate triangle inequality reduction
"""
self.related_thresh = related_thresh
self.sim_metric = sim_metric
self.sim_func = sim_func
self.sim_thresh = sim_thresh
self.reduction = reduction
get_mm_score(reference_set, source_set)
Helper function that computes the maximum weighted bipartite matching score, where elements correspond to nodes and the edges are weighted using the similarity function.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set
|
list
|
Tokenized reference set R |
required |
source_set
|
list
|
Tokenized source set S |
required |
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Maximum matching score (sum of weights of edges in the matching) |
Source code in src/silkmoth/verifier.py
def get_mm_score(self, reference_set, source_set) -> float:
"""
Helper function that computes the maximum weighted bipartite matching score,
where elements correspond to nodes and the edges are weighted using the similarity
function.
Args:
reference_set (list): Tokenized reference set R
source_set (list): Tokenized source set S
Returns:
float: Maximum matching score (sum of weights of edges in the
matching)
"""
n, m = len(reference_set), len(source_set)
if n == 0 or m == 0:
return 0.0
weights = np.zeros((n, m), dtype=float)
for i, r_elem in enumerate(reference_set):
for j, s_elem in enumerate(source_set):
weights[i, j] = self.sim_func(r_elem, s_elem, self.sim_thresh)
# use negative weights to search for minimal cost
cost = -weights
row_ind, col_ind = linear_sum_assignment(cost)
return float(weights[row_ind, col_ind].sum())
get_related_sets(reference_set, candidates, inverted_index)
Gives all candidate sets that are related to the reference set.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set
|
list
|
Tokeinized reference set R |
required |
candidates
|
set
|
Collection of indices of candidate sets |
required |
inverted_index
|
InvertedIndex
|
Inverted index instance |
required |
Returns:
| Name | Type | Description |
|---|---|---|
list |
list
|
Pairs of indices of all related sets from the candidates and |
list
|
their relatedness with the reference set. |
Source code in src/silkmoth/verifier.py
def get_related_sets(self, reference_set: list, candidates: set, inverted_index: InvertedIndex) -> list:
"""
Gives all candidate sets that are related to the reference set.
Args:
reference_set (list): Tokeinized reference set R
candidates (set): Collection of indices of candidate sets
inverted_index (InvertedIndex): Inverted index instance
Returns:
list: Pairs of indices of all related sets from the candidates and
their relatedness with the reference set.
"""
related_sets = []
for c in candidates:
source_set = inverted_index.get_set(c)
relatedness = self.get_relatedness(reference_set, source_set)
if relatedness >= self.related_thresh:
related_sets.append((c, relatedness))
return related_sets
get_relatedness(reference_set, source_set)
Helper function that gives the relatedness score by computing the maximum weighted bipartite matching.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set
|
list
|
Tokenized reference set R |
required |
source_set
|
list
|
Tokenized source set S |
required |
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Relatedness score of R and S |
Source code in src/silkmoth/verifier.py
def get_relatedness(self, reference_set, source_set) -> float:
"""
Helper function that gives the relatedness score by computing the maximum weighted
bipartite matching.
Args:
reference_set (list): Tokenized reference set R
source_set (list): Tokenized source set S
Returns:
float: Relatedness score of R and S
"""
r_size = len(reference_set)
s_size = len(source_set)
exact_matches = 0
if self.reduction:
reference_set, source_set, exact_matches = reduce_sets(reference_set, source_set)
mm_score = self.get_mm_score(reference_set, source_set) + exact_matches
relatedness = self.sim_metric(r_size, s_size, mm_score)
return relatedness
reduce_sets(reference_set, source_set)
Applies the triangle inequality reduction by removing every element from both sets that has an identical match in the other set.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set
|
list
|
Tokenized reference set R |
required |
source_set
|
list
|
Tokenized source set S |
required |
Returns:
| Type | Description |
|---|---|
(list, list, int)
|
Reduced reference set, reduced source set and number of identical elements. |
Source code in src/silkmoth/verifier.py
def reduce_sets(reference_set: list, source_set: list) -> tuple:
"""
Applies the triangle inequality reduction by removing every element from
both sets that has an identical match in the other set.
Args:
reference_set: Tokenized reference set R
source_set: Tokenized source set S
Returns:
(list, list, int): Reduced reference set, reduced source set and number
of identical elements.
"""
r_reduced = reference_set[:]
s_reduced = source_set[:]
count = 0
for elem in reference_set:
if elem in s_reduced:
s_reduced.remove(elem)
r_reduced.remove(elem)
count += 1
return (r_reduced, s_reduced, count)