Source code for efficient_apriori.itemsets

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Implementations of algorithms related to itemsets.
"""

import itertools
import numbers
import typing
import collections
from dataclasses import field, dataclass
import collections.abc


@dataclass
class ItemsetCount:
    itemset_count: int = 0
    members: set = field(default_factory=set)


class TransactionManager:
    # The brilliant transaction manager idea is due to:
    # https://github.com/ymoch/apyori/blob/master/apyori.py

    def __init__(self, transactions: typing.Iterable[typing.Iterable[typing.Hashable]]):
        # A lookup that returns indices of transactions for each item
        self.indices_by_item = collections.defaultdict(set)

        # Populate
        i = -1
        for i, transaction in enumerate(transactions):
            for item in transaction:
                self.indices_by_item[item].add(i)

        # Total number of transactions
        self._transactions = i + 1

    @property
    def items(self):
        return set(self.indices_by_item.keys())

    def __len__(self):
        return self._transactions

    def transaction_indices(self, transaction: typing.Iterable[typing.Hashable]):
        """Return the indices of the transaction."""

        transaction = set(transaction)  # Copy
        item = transaction.pop()
        indices = self.indices_by_item[item]
        while transaction:
            item = transaction.pop()
            indices = indices.intersection(self.indices_by_item[item])
        return indices

    def transaction_indices_sc(self, transaction: typing.Iterable[typing.Hashable], min_support: float = 0):
        """Return the indices of the transaction, with short-circuiting.

        Returns (over_or_equal_to_min_support, set_of_indices)
        """

        # Sort items by number of transaction rows the item appears in,
        # starting with the item beloning to the most transactions
        transaction = sorted(transaction, key=lambda item: len(self.indices_by_item[item]), reverse=True)

        # Pop item appearing in the fewest
        item = transaction.pop()
        indices = self.indices_by_item[item]
        support = len(indices) / len(self)
        if support < min_support:
            return False, None

        # The support is a non-increasing function
        # Sorting by number of transactions the items appear in is a heuristic
        # to make the support drop as quickly as possible
        while transaction:
            item = transaction.pop()
            indices = indices.intersection(self.indices_by_item[item])
            support = len(indices) / len(self)
            if support < min_support:
                return False, None

        # No short circuit happened
        return True, indices


def join_step(itemsets: typing.List[tuple]):
    """
    Join k length itemsets into k + 1 length itemsets.

    This algorithm assumes that the list of itemsets are sorted, and that the
    itemsets themselves are sorted tuples. Instead of always enumerating all
    n^2 combinations, the algorithm only has n^2 runtime for each block of
    itemsets with the first k - 1 items equal.

    Parameters
    ----------
    itemsets : list of itemsets
        A list of itemsets of length k, to be joined to k + 1 length
        itemsets.

    Examples
    --------
    >>> # This is an example from the 1994 paper by Agrawal et al.
    >>> itemsets = [(1, 2, 3), (1, 2, 4), (1, 3, 4), (1, 3, 5), (2, 3, 4)]
    >>> list(join_step(itemsets))
    [(1, 2, 3, 4), (1, 3, 4, 5)]
    """
    i = 0
    # Iterate over every itemset in the itemsets
    while i < len(itemsets):
        # The number of rows to skip in the while-loop, initially set to 1
        skip = 1

        # Get all but the last item in the itemset, and the last item
        *itemset_first, itemset_last = itemsets[i]

        # We now iterate over every itemset following this one, stopping
        # if the first k - 1 items are not equal. If we're at (1, 2, 3),
        # we'll consider (1, 2, 4) and (1, 2, 7), but not (1, 3, 1)

        # Keep a list of all last elements, i.e. tail elements, to perform
        # 2-combinations on later on
        tail_items = [itemset_last]
        tail_items_append = tail_items.append  # Micro-optimization

        # Iterate over ever itemset following this itemset
        for j in range(i + 1, len(itemsets)):
            # Get all but the last item in the itemset, and the last item
            *itemset_n_first, itemset_n_last = itemsets[j]

            # If it's the same, append and skip this itemset in while-loop
            if itemset_first == itemset_n_first:
                # Micro-optimization
                tail_items_append(itemset_n_last)
                skip += 1

            # If it's not the same, break out of the for-loop
            else:
                break

        # For every 2-combination in the tail items, yield a new candidate
        # itemset, which is sorted.
        itemset_first_tuple = tuple(itemset_first)
        for a, b in itertools.combinations(tail_items, 2):
            yield itemset_first_tuple + (a,) + (b,)

        # Increment the while-loop counter
        i += skip


def prune_step(itemsets: typing.Iterable[tuple], possible_itemsets: typing.List[tuple]):
    """
    Prune possible itemsets whose subsets are not in the list of itemsets.

    Parameters
    ----------
    itemsets : list of itemsets
        A list of itemsets of length k.
    possible_itemsets : list of itemsets
        A list of possible itemsets of length k + 1 to be pruned.

    Examples
    -------
    >>> itemsets = [('a', 'b', 'c'), ('a', 'b', 'd'),
    ...             ('b', 'c', 'd'), ('a', 'c', 'd')]
    >>> possible_itemsets = list(join_step(itemsets))
    >>> list(prune_step(itemsets, possible_itemsets))
    [('a', 'b', 'c', 'd')]
    """

    # For faster lookups
    itemsets = set(itemsets)

    # Go through every possible itemset
    for possible_itemset in possible_itemsets:
        # Remove 1 from the combination, same as k-1 combinations
        # The itemsets created by removing the last two items in the possible
        # itemsets must be part of the itemsets by definition,
        # due to the way the `join_step` function merges the sorted itemsets

        for i in range(len(possible_itemset) - 2):
            removed = possible_itemset[:i] + possible_itemset[i + 1 :]

            # If every k combination exists in the set of itemsets,
            # yield the possible itemset. If it does not exist, then it's
            # support cannot be large enough, since supp(A) >= supp(AB) for
            # all B, and if supp(S) is large enough, then supp(s) must be large
            # enough for every s which is a subset of S.
            # This is the downward-closure property of the support function.
            if removed not in itemsets:
                break

        # If we have not breaked yet
        else:
            yield possible_itemset


def apriori_gen(itemsets: typing.List[tuple]):
    """
    Compute all possible k + 1 length supersets from k length itemsets.

    This is done efficiently by using the downward-closure property of the
    support function, which states that if support(S) > k, then support(s) > k
    for every subset s of S.

    Parameters
    ----------
    itemsets : list of itemsets
        A list of itemsets of length k.

    Examples
    -------
    >>> # This is an example from the 1994 paper by Agrawal et al.
    >>> itemsets = [(1, 2, 3), (1, 2, 4), (1, 3, 4), (1, 3, 5), (2, 3, 4)]
    >>> possible_itemsets = list(join_step(itemsets))
    >>> list(prune_step(itemsets, possible_itemsets))
    [(1, 2, 3, 4)]
    """
    possible_extensions = join_step(itemsets)
    yield from prune_step(itemsets, possible_extensions)


[docs] def itemsets_from_transactions( transactions: typing.Iterable[typing.Union[set, tuple, list]], min_support: float, max_length: int = 8, verbosity: int = 0, output_transaction_ids: bool = False, ): """ Compute itemsets from transactions by building the itemsets bottom up and iterating over the transactions to compute the support repedately. This is the heart of the Apriori algorithm by Agrawal et al. in the 1994 paper. Parameters ---------- transactions : a list of itemsets (tuples/sets/lists with hashable entries) min_support : float The minimum support of the itemsets, i.e. the minimum frequency as a percentage. max_length : int The maximum length of the itemsets. verbosity : int The level of detail printing when the algorithm runs. Either 0, 1 or 2. output_transaction_ids : bool If set to true, the output contains the ids of transactions that contain a frequent itemset. The ids are the enumeration of the transactions in the sequence they appear. Examples -------- >>> # This is an example from the 1994 paper by Agrawal et al. >>> transactions = [(1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5)] >>> itemsets, _ = itemsets_from_transactions(transactions, min_support=2/5) >>> itemsets[1] == {(1,): 2, (2,): 3, (3,): 3, (5,): 3} True >>> itemsets[2] == {(1, 3): 2, (2, 3): 2, (2, 5): 3, (3, 5): 2} True >>> itemsets[3] == {(2, 3, 5): 2} True """ # STEP 0 - Sanitize user inputs # ----------------------------- if not (isinstance(min_support, numbers.Number) and (0 <= min_support <= 1)): raise ValueError("`min_support` must be a number between 0 and 1.") # Store in transaction manager manager = TransactionManager(transactions) # If no transactions are present transaction_count = len(manager) if transaction_count == 0: return dict(), 0 # large_itemsets, num_transactions # STEP 1 - Generate all large itemsets of size 1 # ---------------------------------------------- if verbosity > 0: print("Generating itemsets.") print(" Counting itemsets of length 1.") candidates: typing.Dict[tuple, int] = {(item,): len(indices) for item, indices in manager.indices_by_item.items()} large_itemsets: typing.Dict[int, typing.Dict[tuple, int]] = { 1: {item: count for (item, count) in candidates.items() if (count / len(manager)) >= min_support} } if verbosity > 0: print(" Found {} candidate itemsets of length 1.".format(len(manager.items))) print(" Found {} large itemsets of length 1.".format(len(large_itemsets.get(1, dict())))) if verbosity > 1: print(" {}".format(list(item for item in large_itemsets.get(1, dict()).keys()))) # If large itemsets were found, convert to dictionary if not large_itemsets.get(1, dict()): return dict(), 0 # large_itemsets, num_transactions # STEP 2 - Build up the size of the itemsets # ------------------------------------------ # While there are itemsets of the previous size k = 2 while large_itemsets[k - 1] and (max_length != 1): if verbosity > 0: print(" Counting itemsets of length {}.".format(k)) # STEP 2a) - Build up candidate of larger itemsets # Retrieve the itemsets of the previous size, i.e. of size k - 1 # They must be sorted to maintain the invariant when joining/pruning itemsets_list = sorted(item for item in large_itemsets[k - 1].keys()) # Gen candidates of length k + 1 by joining, prune, and copy as set # This algorithm assumes that the list of itemsets are sorted, # and that the itemsets themselves are sorted tuples C_k: typing.List[tuple] = list(apriori_gen(itemsets_list)) if verbosity > 0: print(" Found {} candidate itemsets of length {}.".format(len(C_k), k)) if verbosity > 1: print(" {}".format(C_k)) # If no candidate itemsets were found, break out of the loop if not C_k: break # Prepare counts of candidate itemsets (from the prune step) if verbosity > 1: print(" Iterating over transactions.") # Keep only large transactions found_itemsets: typing.Dict[tuple, int] = dict() for candidate in C_k: over_min_support, indices = manager.transaction_indices_sc(candidate, min_support=min_support) if over_min_support: found_itemsets[candidate] = len(indices) # If no itemsets were found, break out of the loop if not found_itemsets: break # Candidate itemsets were found, add them large_itemsets[k] = {i: counts for (i, counts) in found_itemsets.items()} if verbosity > 0: num_found = len(large_itemsets[k]) print(" Found {} large itemsets of length {}.".format(num_found, k)) if verbosity > 1: print(" {}".format(list(large_itemsets[k].keys()))) k += 1 # Break out if we are about to consider larger itemsets than the max if k > max_length: break if verbosity > 0: print("Itemset generation terminated.\n") if output_transaction_ids: itemsets_out = { length: { item: ItemsetCount(itemset_count=count, members=manager.transaction_indices(set(item))) for (item, count) in itemsets.items() } for (length, itemsets) in large_itemsets.items() } return itemsets_out, len(manager) return large_itemsets, len(manager)
if __name__ == "__main__": import pytest pytest.main(args=[".", "--doctest-modules", "-v"])