InvertedIndex

The inverted index

  • allows to lookup all appearances of a token in a collection of tokenized sets
  • returns inverted lists consisting of (set, element) tuples
  • supports full sets/elements and positional indexes of them
  • stores source sets in SilkMothEngine

The inverted list

  • is sorted first by the order of the sets and then by the order of the elements.

Examples

>>> from silkmoth.inverted_index import InvertedIndex
>>> S1 = [{"Apple", "Pear", "Car"}, {"Apple", "Sun", "Cat"}]
>>> S2 = [{"Apple", "Berlin", "Sun"}, {"Apple"}]
>>> S = [S1, S2]
>>> I = InvertedIndex(S)
>>> I.get_indexes("Sun")
[(0, 1), (1, 0)]
>>> I["Berlin"]
[([{'Sun', 'Apple', 'Berlin'}, {'Apple'}], {'Sun', 'Apple', 'Berlin'})]

SilkMoth Inverted Index

SilkMoth Inverted Index. Source: Deng et al., "SILKMOTH: An Efficient Method for Finding Related Sets with Maximum Matching Constraints", VLDB 2017.
Licensed under CC BY-NC-ND 4.0.

Source code in src/silkmoth/inverted_index.py
class InvertedIndex:
    """
    The inverted index

    - allows to lookup all appearances of a token in a collection of tokenized sets
    - returns inverted lists consisting of (set, element) tuples
    - supports full sets/elements and positional indexes of them
    - stores source sets in [SilkMothEngine](silkmoth_engine.md)

    The inverted list

    - is sorted first by the order of the sets and then by the order of the elements.

    Examples
    --------
    ```
    >>> from silkmoth.inverted_index import InvertedIndex
    >>> S1 = [{"Apple", "Pear", "Car"}, {"Apple", "Sun", "Cat"}]
    >>> S2 = [{"Apple", "Berlin", "Sun"}, {"Apple"}]
    >>> S = [S1, S2]
    >>> I = InvertedIndex(S)
    >>> I.get_indexes("Sun")
    [(0, 1), (1, 0)]
    >>> I["Berlin"]
    [([{'Sun', 'Apple', 'Berlin'}, {'Apple'}], {'Sun', 'Apple', 'Berlin'})]
    ```

    ![SilkMoth Inverted Index](../figures/InvertedIndex.png)

    *SilkMoth Inverted Index. Source: Deng et al., "SILKMOTH: An Efficient Method for Finding Related Sets with Maximum Matching Constraints", VLDB 2017.  
    Licensed under CC BY-NC-ND 4.0.*
    """

    def __init__(self, token_sets: list):
        """
        Initialize the inverted index.

        Args:
            token_sets (list): Collection of tokenized sets
            sim_func (function): Similarity function
        """
        #self.sim_func = sim_func
        self.token_sets = []
        self.lookup_table = dict()

        for set_idx, token_set in enumerate(token_sets):
            self.token_sets.append(token_set)
            for element_idx, tokens in enumerate(token_set):
                for token in tokens:
                    key = (set_idx, element_idx)
                    if token not in self.lookup_table:
                        self.lookup_table[token] = [key]
                    elif self.lookup_table[token][-1] != key:
                        self.lookup_table[token].append(key)

    def keys(self):
        """
        Gives all tokens similar like dict.keys().

        Returns:
            set (set): A set-like object providing all keys
        """
        return self.lookup_table.keys()

    def __getitem__(self, token) -> list:
        """
        Access inverted list from inverted index using square brackets.

        Args:
            token (str): Input token

        Returns:
            list:   A list of all (set, element) tuples which contain the input 
                    token.
        """
        idx_list = self.get_indexes(token)
        return [(self.get_set(s), self.get_set(s)[e]) for s, e in idx_list]

    def get_indexes(self, token) -> list:
        """
        Access inverted list of indexes. For some tasks retrieving the full set
        and element pairs might not be necessary and their indexes are 
        sufficient.

        Args:
            token (str): Input token

        Returns:
            list:   A list of all (set index, element index) tuples for (set, 
                    element) tuples which contain the input tuple
        """
        if not token in self.lookup_table:
            raise ValueError(f"Unknown token") 
        return self.lookup_table[token] 

    def get_set(self, set_id: int) -> list:
        """
        Access (tokenized) set from set ID.

        Args:
            set_id: Set ID

        Returns:
            list: Tokenized set
        """
        if set_id < 0 or set_id >= len(self.token_sets):
            raise ValueError(f"Invalid id")
        return self.token_sets[set_id]

    def get_indexes_binary(self, token, set_idx) -> list:
        """
        Uses binary search to get all (set_idx, element_idx) pairs for a token
        where set_idx matches the given set_idx.

        Args:
            token (str): The token to search in the inverted index.
            set_idx (int): The ID of the set we want the element indexes for.

        Returns:
            list: All (set_idx, element_idx) tuples where the token appears in the given set.
        """
        if token not in self.lookup_table:
            raise ValueError("Unknown token")

        index_list = self.lookup_table[token]

        # Using bisect to find the range of entries where set_idx matches
        left = bisect.bisect_left(index_list, (set_idx, -1))
        right = bisect.bisect_right(index_list, (set_idx, float('inf')))
        return index_list[left:right]

    def print_index(self):
        """
        Prints the inverted index in a readable format.
        """
        print("=== Inverted Index ===")
        for token, locations in self.lookup_table.items():
            print(f"Token: {token} → Locations: {locations}")

__getitem__(token)

Access inverted list from inverted index using square brackets.

Parameters:

Name Type Description Default
token str

Input token

required

Returns:

Name Type Description
list list

A list of all (set, element) tuples which contain the input token.

Source code in src/silkmoth/inverted_index.py
def __getitem__(self, token) -> list:
    """
    Access inverted list from inverted index using square brackets.

    Args:
        token (str): Input token

    Returns:
        list:   A list of all (set, element) tuples which contain the input 
                token.
    """
    idx_list = self.get_indexes(token)
    return [(self.get_set(s), self.get_set(s)[e]) for s, e in idx_list]

__init__(token_sets)

Initialize the inverted index.

Parameters:

Name Type Description Default
token_sets list

Collection of tokenized sets

required
sim_func function

Similarity function

required
Source code in src/silkmoth/inverted_index.py
def __init__(self, token_sets: list):
    """
    Initialize the inverted index.

    Args:
        token_sets (list): Collection of tokenized sets
        sim_func (function): Similarity function
    """
    #self.sim_func = sim_func
    self.token_sets = []
    self.lookup_table = dict()

    for set_idx, token_set in enumerate(token_sets):
        self.token_sets.append(token_set)
        for element_idx, tokens in enumerate(token_set):
            for token in tokens:
                key = (set_idx, element_idx)
                if token not in self.lookup_table:
                    self.lookup_table[token] = [key]
                elif self.lookup_table[token][-1] != key:
                    self.lookup_table[token].append(key)

get_indexes(token)

Access inverted list of indexes. For some tasks retrieving the full set and element pairs might not be necessary and their indexes are sufficient.

Parameters:

Name Type Description Default
token str

Input token

required

Returns:

Name Type Description
list list

A list of all (set index, element index) tuples for (set, element) tuples which contain the input tuple

Source code in src/silkmoth/inverted_index.py
def get_indexes(self, token) -> list:
    """
    Access inverted list of indexes. For some tasks retrieving the full set
    and element pairs might not be necessary and their indexes are 
    sufficient.

    Args:
        token (str): Input token

    Returns:
        list:   A list of all (set index, element index) tuples for (set, 
                element) tuples which contain the input tuple
    """
    if not token in self.lookup_table:
        raise ValueError(f"Unknown token") 
    return self.lookup_table[token] 

get_indexes_binary(token, set_idx)

Uses binary search to get all (set_idx, element_idx) pairs for a token where set_idx matches the given set_idx.

Parameters:

Name Type Description Default
token str

The token to search in the inverted index.

required
set_idx int

The ID of the set we want the element indexes for.

required

Returns:

Name Type Description
list list

All (set_idx, element_idx) tuples where the token appears in the given set.

Source code in src/silkmoth/inverted_index.py
def get_indexes_binary(self, token, set_idx) -> list:
    """
    Uses binary search to get all (set_idx, element_idx) pairs for a token
    where set_idx matches the given set_idx.

    Args:
        token (str): The token to search in the inverted index.
        set_idx (int): The ID of the set we want the element indexes for.

    Returns:
        list: All (set_idx, element_idx) tuples where the token appears in the given set.
    """
    if token not in self.lookup_table:
        raise ValueError("Unknown token")

    index_list = self.lookup_table[token]

    # Using bisect to find the range of entries where set_idx matches
    left = bisect.bisect_left(index_list, (set_idx, -1))
    right = bisect.bisect_right(index_list, (set_idx, float('inf')))
    return index_list[left:right]

get_set(set_id)

Access (tokenized) set from set ID.

Parameters:

Name Type Description Default
set_id int

Set ID

required

Returns:

Name Type Description
list list

Tokenized set

Source code in src/silkmoth/inverted_index.py
def get_set(self, set_id: int) -> list:
    """
    Access (tokenized) set from set ID.

    Args:
        set_id: Set ID

    Returns:
        list: Tokenized set
    """
    if set_id < 0 or set_id >= len(self.token_sets):
        raise ValueError(f"Invalid id")
    return self.token_sets[set_id]

keys()

Gives all tokens similar like dict.keys().

Returns:

Name Type Description
set set

A set-like object providing all keys

Source code in src/silkmoth/inverted_index.py
def keys(self):
    """
    Gives all tokens similar like dict.keys().

    Returns:
        set (set): A set-like object providing all keys
    """
    return self.lookup_table.keys()

print_index()

Prints the inverted index in a readable format.

Source code in src/silkmoth/inverted_index.py
def print_index(self):
    """
    Prints the inverted index in a readable format.
    """
    print("=== Inverted Index ===")
    for token, locations in self.lookup_table.items():
        print(f"Token: {token} → Locations: {locations}")