Tokenizer
Source code in src/silkmoth/tokenizer.py
class Tokenizer:
def __init__(self, sim_func, q=3):
"""
Initialize the Tokenizer with a similarity function.
Args:
sim_func (callable): The similarity function that influences tokenization behavior.
q (int): The q-gram size for tokenization, default is 3.
"""
self.sim_func = sim_func
self.q = q
def tokenize(self, input_set: list) -> list:
"""
Tokenizes the input based on the similarity function.
Args:
input_set: The input set to tokenize.
Returns:
list: Tokenized elements of the input: token sets for Jaccard similarity, or ordered q-gram lists for edit similarity.
"""
if self.sim_func == jaccard_similarity:
tokens = jaccard_tokenize(input_set)
elif self.sim_func == edit_similarity or self.sim_func == N_edit_similarity:
tokens = qgram_tokenize(input_set, self.q)
else:
raise ValueError("Unsupported similarity function")
return tokens
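A minimal usage sketch based on the source above. The `Tokenizer` import follows this page's path; the module that exports `jaccard_similarity` is not shown here, so its import below is an assumption to adjust for your package layout.

```python
# Hypothetical usage sketch. The location of jaccard_similarity is assumed;
# it is referenced by Tokenizer.tokenize but not defined in this module listing.
from silkmoth.tokenizer import Tokenizer
from silkmoth.utils import jaccard_similarity  # assumed import path

tok = Tokenizer(sim_func=jaccard_similarity)
print(tok.tokenize(["hello world", ["foo", "bar baz"]]))
# -> [{'hello', 'world'}, {'foo', 'bar', 'baz'}]  (set ordering may vary)
```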
__init__(sim_func, q=3)
Initialize the Tokenizer with a similarity function.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `sim_func` | `callable` | The similarity function that influences tokenization behavior. | *required* |
| `q` | `int` | The q-gram size for tokenization, default is 3. | `3` |
Source code in src/silkmoth/tokenizer.py
def __init__(self, sim_func, q=3):
"""
Initialize the Tokenizer with a similarity function.
Args:
sim_func (callable): The similarity function that influences tokenization behavior.
q (int): The q-gram size for tokenization, default is 3.
"""
self.sim_func = sim_func
self.q = q
tokenize(input_set)
Tokenizes the input based on the similarity function.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_set` | `list` | The input set to tokenize. | *required* |
Returns:
| Name | Type | Description |
|---|---|---|
| `list` | `list` | Tokenized elements of the input: token sets for Jaccard similarity, or ordered q-gram lists for edit similarity. |
Source code in src/silkmoth/tokenizer.py
def tokenize(self, input_set: list) -> list:
"""
Tokenizes the input based on the similarity function.
Args:
input_set: The input set to tokenize.
Returns:
list: Tokenized elements of the input: token sets for Jaccard similarity, or ordered q-gram lists for edit similarity.
"""
if self.sim_func == jaccard_similarity:
tokens = jaccard_tokenize(input_set)
elif self.sim_func == edit_similarity or self.sim_func == N_edit_similarity:
tokens = qgram_tokenize(input_set, self.q)
else:
raise ValueError("Unsupported similarity function")
return tokens
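A short sketch of the dispatch above: `jaccard_similarity` routes to `jaccard_tokenize`, while `edit_similarity` and `N_edit_similarity` route to `qgram_tokenize` with the configured `q`; any other callable raises `ValueError`. As before, the similarity-function import path is an assumption.

```python
# Dispatch sketch; edit_similarity's import path is assumed (not shown on this page).
from silkmoth.tokenizer import Tokenizer
from silkmoth.utils import edit_similarity  # assumed import path

qtok = Tokenizer(sim_func=edit_similarity, q=2)
print(qtok.tokenize(["abcd"]))  # -> [['ab', 'bc', 'cd']]

try:
    Tokenizer(sim_func=len).tokenize(["abcd"])  # len is not a supported sim_func
except ValueError as err:
    print(err)  # -> Unsupported similarity function
```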
jaccard_tokenize(input_set)
Tokenizes the input for Jaccard similarity using whitespace splitting.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_set` | `list` | The input set to tokenize. | *required* |
Returns:
| Name | Type | Description |
|---|---|---|
| `list` | `list` | A list of token sets, one set of whitespace-delimited str tokens per input element. |
Source code in src/silkmoth/tokenizer.py
def jaccard_tokenize(input_set: list) -> list:
"""
Tokenizes the input for Jaccard similarity using whitespace splitting.
Args:
input_set: The input set to tokenize.
Returns:
list: A list of token sets, one set of whitespace-delimited str tokens per input element.
"""
tokens = []
for element in input_set:
if isinstance(element, (str, int, float, bool)):
tokens.append(set(str(element).split()))
elif isinstance(element, (list, tuple)):
sub_tokens = set()
for sub_element in element:
if isinstance(sub_element, (str, int, float, bool)):
sub_tokens.update(str(sub_element).split())
elif isinstance(sub_element, (list, tuple)):
for sub_sub_element in sub_element:
if isinstance(sub_sub_element, (str, int, float, bool)):
sub_tokens.update(str(sub_sub_element).split())
else:
raise ValueError(
f"Unsupported nested type: {type(sub_element)}"
)
else:
raise ValueError(
f"Unsupported nested type: {type(sub_element)}"
)
tokens.append(sub_tokens)
else:
raise ValueError(f"Unsupported element type: {type(element)}")
return tokens
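For illustration, the nesting rules above in action: each top-level element becomes one set of whitespace-split tokens, with lists and tuples flattened up to two levels deep; anything else raises `ValueError`.

```python
from silkmoth.tokenizer import jaccard_tokenize

print(jaccard_tokenize(["red apple", ("green", ["pear", "ripe pear"]), 42]))
# -> [{'red', 'apple'}, {'green', 'pear', 'ripe'}, {'42'}]  (set ordering may vary)
```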
qgram_tokenize(input_set, q)
Tokenizes the input using q-gram tokenization.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_set` | `list` | Input set with strings or nested values. | *required* |
| `q` | `int` | Length of q-gram. | *required* |
Returns:
| Type | Description |
|---|---|
| `list[list[str]]` | A list of lists, each containing ordered q-gram tokens. |
Source code in src/silkmoth/tokenizer.py
def qgram_tokenize(input_set: list, q: int) -> list[list[str]]:
"""
Tokenizes the input using q-gram tokenization.
Args:
input_set (list): Input set with strings or nested values.
q (int): Length of q-gram.
Returns:
list[list[str]]: A list of lists, each containing ordered q-gram tokens.
"""
def to_qgrams(s: str) -> list[str]:
s = s.strip()
if len(s) < q:
return []
return [s[i:i+q] for i in range(len(s) - q + 1)]
def flatten(x):
for el in x:
if isinstance(el, (list, tuple)):
yield from flatten(el)
else:
yield el
tokens = []
for element in input_set:
if isinstance(element, (str, int, float, bool)):
s = str(element)
elif isinstance(element, (list, tuple)):
# Flatten nested elements and join with space
s = " ".join(str(x) for x in flatten(element))
else:
raise ValueError(f"Unsupported element type: {type(element)}")
tokens.append(to_qgrams(s)) # generate q-grams for the full string
return tokens
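For illustration: with `q=3`, elements shorter than three characters (after stripping) produce an empty list, and nested elements are flattened and space-joined before q-grams are taken, so q-grams can span the joining space.

```python
from silkmoth.tokenizer import qgram_tokenize

print(qgram_tokenize(["hello", "hi", ["ab", "cd"]], 3))
# -> [['hel', 'ell', 'llo'], [], ['ab ', 'b c', ' cd']]
```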