SigType
Bases: Enum
Signature type enum.
Source code in src/silkmoth/utils.py
class SigType(Enum):
    """
    Enumeration of the available signature types.
    """

    # Each member's value is the lowercase string form of its name.
    WEIGHTED = "weighted"
    SKYLINE = "skyline"
    DICHOTOMY = "dichotomy"
N_edit_similarity(x, y, sim_thresh=0)
Computes the normalized edit similarity NEds between two strings or sets/lists of tokens: \(NEds(x, y) = 1 - LD(x, y) / max(|x|, |y|)\)
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
x
|
str or set/list of str
|
First input |
required |
y
|
str or set/list of str
|
Second input |
required |
sim_thresh
|
float
|
Similarity threshold (default: 0) |
0
|
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Similarity score in [0, 1], or 0 if below threshold |
Source code in src/silkmoth/utils.py
def N_edit_similarity(x, y, sim_thresh=0) -> float:
    """
    Computes the normalized edit similarity NEds between two inputs:
    $NEds(x, y) = 1 - LD(x, y) / max(|x|, |y|)$

    Both inputs are first converted with `reverse_qgrams`; the Levenshtein
    distance LD and the lengths are taken from the resulting strings.

    Args:
        x (str or list of q-grams): First input
        y (str or list of q-grams): Second input
        sim_thresh (float): Similarity threshold (default: 0)

    Returns:
        float: Similarity score, or 0.0 if either input is empty or the
            score is below `sim_thresh`
    """
    x_str = reverse_qgrams(x)
    y_str = reverse_qgrams(y)

    # Empty inputs (including two empty inputs) are never considered similar.
    if not x_str or not y_str:
        return .0

    ld = Levenshtein.distance(x_str, y_str)

    # Both strings are non-empty at this point, so the denominator is >= 1
    # and no separate zero-length check is needed.
    neds_score = 1 - (ld / max(len(x_str), len(y_str)))
    return neds_score if neds_score >= sim_thresh else .0
contain(reference_set_size, source_set_size, mm_score)
Computes Set-Containment metric which checks whether one set S is approximately a superset of another set R. Set pairs (R, S) with \(|R| > |S|\) should be filtered in advance. Set-Containment is defined as \(contain(R, S) = mm\_score / |R|\).
Examples
>>> from silkmoth.utils import contain
>>> contain(2, 3, 2)
1.0
>>> contain(2, 3, 1.5)
0.75
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set_size
|
int
|
Size of set R |
required |
source_set_size
|
int
|
Size of set S |
required |
mm_score
|
float
|
Maximum matching score of R and S |
required |
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Set-Containment |
Source code in src/silkmoth/utils.py
def contain(reference_set_size: int, source_set_size: int, mm_score: float) -> float:
    r"""
    Computes Set-Containment metric which checks whether one set S is approximately
    a superset of another set R. Set pairs (R, S) with $|R| > |S|$ should be filtered
    in advance. Set-Containment is defined as $contain(R, S) = mm\_score / |R|$.

    Examples
    --------
    ```
    >>> from silkmoth.utils import contain
    >>> contain(2, 3, 2)
    1.0
    >>> contain(2, 3, 1.5)
    0.75
    ```

    Args:
        reference_set_size: Size of set R
        source_set_size: Size of set S
        mm_score: Maximum matching score of R and S

    Returns:
        float: Set-Containment

    Raises:
        ValueError: If `reference_set_size` is greater than `source_set_size`.
        ZeroDivisionError: If `reference_set_size` is 0.
    """
    # Pairs with |R| > |S| must be filtered out by the caller beforehand.
    if reference_set_size > source_set_size:
        raise ValueError("Reference set too large")
    return mm_score / reference_set_size
edit_similarity(x, y, sim_thresh=0)
Computes the edit similarity between two strings based on the formula given in the SILKMOTH paper: \(Eds(x, y) = 1 - (2 * LD(x, y)) / (|x| + |y| + LD(x, y))\)
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
x
|
str or set/list of str
|
First input |
required |
y
|
str or set/list of str
|
Second input |
required |
sim_thresh
|
float
|
Similarity threshold alpha (default is 0) |
0
|
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Edit similarity score (0 if below threshold) |
Source code in src/silkmoth/utils.py
def edit_similarity(x, y, sim_thresh=0) -> float:
    """
    Edit similarity of two inputs, following the formula from the
    SILKMOTH paper:
    $Eds(x, y) = 1 - (2 * LD(x, y)) / (|x| + |y| + LD(x, y))$

    Both inputs are converted with `reverse_qgrams` before the Levenshtein
    distance LD is computed.

    Args:
        x (str or set/list of str): First input
        y (str or set/list of str): Second input
        sim_thresh (float): Similarity threshold alpha (default is 0)

    Returns:
        float: Edit similarity score (0 if below threshold or if either
            input is empty)
    """
    left = reverse_qgrams(x)
    right = reverse_qgrams(y)

    # An empty side can never be similar to anything.
    if not (left and right):
        return .0

    distance = Levenshtein.distance(left, right)
    combined_len = len(left) + len(right)
    score = 1 - (2 * distance) / (combined_len + distance)

    if score >= sim_thresh:
        return score
    return .0
flatten_tokens(input_val)
Flattens a set or list of tokens (whose elements may themselves be sets or lists, one level deep) into a single space-separated string. Any other input is returned unchanged.
Source code in src/silkmoth/utils.py
def flatten_tokens(input_val):
    """
    Flattens a set or list of tokens into a single space-separated string.

    Elements that are themselves sets or lists are expanded one level deep;
    deeper nesting is not expanded. Iterating a set yields its elements in
    arbitrary order, so the token order for set inputs is not fixed.

    Args:
        input_val (str or set/list): Tokens to flatten, or a string

    Returns:
        str: The tokens joined by single spaces if `input_val` is a set or
            list; otherwise `input_val` itself, unchanged
    """
    if not isinstance(input_val, (set, list)):
        return input_val  # assume it's already a string

    flat = []
    for elem in input_val:
        if isinstance(elem, (set, list)):
            flat.extend(elem)
        else:
            flat.append(elem)
    return " ".join(flat)
jaccard_similarity(x, y, sim_thresh=0)
Gives the Jaccard similarity of two set-like objects. Jaccard similarity is defined as \(Jac(x, y) = |x \cap y|/|x \cup y|\).
For some applications we may want to omit pairs with low similarity. Therefore a similarity threshold α is provided. If the similarity score is below this threshold, this function returns zero.
Examples
>>> from silkmoth.utils import jaccard_similarity
>>> x = {"a", "b", "c"}
>>> y = {"a", "b", "c"}
>>> jaccard_similarity(x, y)
1.0
>>> y.add("d")
>>> jaccard_similarity(x, y)
0.75
>>> jaccard_similarity(x, y, 0.8)
0.0
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
x
|
set
|
Input element x |
required |
y
|
set
|
Input element y |
required |
sim_thresh
|
float
|
Similarity threshold alpha |
0
|
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Jaccard similarity score |
Source code in src/silkmoth/utils.py
def jaccard_similarity(x: set, y: set, sim_thresh=0) -> float:
    r"""
    Gives the Jaccard similarity of two set-like objects. Jaccard similarity is
    defined as $Jac(x, y) = |x \cap y|/|x \cup y|$.

    For some applications we may want to omit pairs with low similarity.
    Therefore a similarity threshold α is provided. If the similarity score
    is below this threshold, this function returns zero.

    Examples
    --------
    ```
    >>> from silkmoth.utils import jaccard_similarity
    >>> x = {"a", "b", "c"}
    >>> y = {"a", "b", "c"}
    >>> jaccard_similarity(x, y)
    1.0
    >>> y.add("d")
    >>> jaccard_similarity(x, y)
    0.75
    >>> jaccard_similarity(x, y, 0.8)
    0.0
    ```

    Args:
        x (set): Input element x
        y (set): Input element y
        sim_thresh (float): Similarity threshold alpha

    Returns:
        float: Jaccard similarity score, or 0.0 if either input is empty
            or the score is below `sim_thresh`
    """
    # An empty side yields 0.0 and also keeps the division below safe.
    if len(x) == 0 or len(y) == 0:
        return .0
    jac = len(x & y) / len(x | y)
    # A score exactly equal to the threshold is kept.
    return jac if jac >= sim_thresh else .0
reverse_qgrams(input_val)
Reverse qgrams back to their original text.
Source code in src/silkmoth/utils.py
def reverse_qgrams(input_val) -> str:
    """
    Rebuild the original text from a sequence of q-grams.

    The first character of every q-gram except the last is taken, and the
    last q-gram is appended in full. Inputs that are neither a list nor an
    OrderedSet are returned unchanged.
    """
    if not isinstance(input_val, (list, OrderedSet)):
        return input_val  # assume it's already a string

    gram_count = len(input_val)
    if gram_count == 0:
        return ""
    if gram_count == 1:
        return input_val[0]

    # Leading characters of all but the final q-gram, then the final q-gram whole.
    prefix = "".join(gram[0] for gram in input_val[:-1])
    return prefix + input_val[-1]
similar(reference_set_size, source_set_size, mm_score)
Computes Set-Similarity metric which checks whether two sets R and S are approximately equivalent. Set-Similarity is defined as \(similar(R, S) = mm\_score / (|R| + |S| - mm\_score)\).
Examples
>>> from silkmoth.utils import similar
>>> similar(3, 3, 3)
1.0
>>> similar(3, 3, 1.5)
0.3333333333333333
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
reference_set_size
|
int
|
Size of set R |
required |
source_set_size
|
int
|
Size of set S |
required |
mm_score
|
float
|
Maximum matching score of R and S |
required |
Returns:
| Name | Type | Description |
|---|---|---|
float |
float
|
Set-Similarity |
Source code in src/silkmoth/utils.py
def similar(reference_set_size: int, source_set_size: int, mm_score: float) -> float:
    r"""
    Computes Set-Similarity metric which checks whether two sets R and S are approximately
    equivalent. Set-Similarity is defined as $similar(R, S) = mm\_score / (|R| + |S| - mm\_score)$.

    Examples
    --------
    ```
    >>> from silkmoth.utils import similar
    >>> similar(3, 3, 3)
    1.0
    >>> similar(3, 3, 1.5)
    0.3333333333333333
    ```

    Args:
        reference_set_size: Size of set R
        source_set_size: Size of set S
        mm_score: Maximum matching score of R and S

    Returns:
        float: Set-Similarity

    Raises:
        ZeroDivisionError: If `reference_set_size + source_set_size - mm_score`
            is 0 (for example when both sizes and the score are 0).
    """
    return mm_score / (reference_set_size + source_set_size - mm_score)