Source code for armine.armine

from io import open
from itertools import chain
try:
    # Python 2
    from itertools import ifilterfalse as filterfalse
except ImportError:
    # Python 3
    from itertools import filterfalse
from beautifultable import BeautifulTable

from .utils import get_subsets
from .rule import AssociationRule


class ARM(object):
    """Utility class for Association Rule Mining.

    This class provides methods to generate a set of Association rules
    from a transactional dataset.

    Typical usage is to load transactions with :meth:`load` or
    :meth:`load_from_csv`, call :meth:`learn` with the desired thresholds
    and then read :attr:`rules` or call :meth:`print_rules`.
    """

    def __init__(self):
        self._dataset = []
        self._rules = []
        # Cache of frozenset(items) -> number of transactions containing
        # all of those items; filled lazily by ``_get_itemcount``.
        self._itemcounts = {}
        self.set_rule_key(lambda rule: (rule.lift, rule.confidence,
                                        len(rule.antecedent)))
        # "Apparent" thresholds are the ones most recently requested by
        # the caller; "real" thresholds are the ones the stored rules were
        # actually mined with.  ``inf`` means nothing has been mined yet.
        self._apparent_support_threshold = None
        self._apparent_confidence_threshold = None
        self._apparent_coverage_threshold = None
        self._real_support_threshold = float('inf')
        self._real_confidence_threshold = float('inf')
        self._real_coverage_threshold = float('inf')

    @property
    def rules(self):
        """Get a list of rules generated using the loaded dataset.

        Only rules satisfying the most recently requested thresholds are
        returned, which may be a subset of the rules actually stored.
        """
        support = self._apparent_support_threshold
        confidence = self._apparent_confidence_threshold
        # NOTE(review): the support threshold is compared against
        # ``rule.coverage``; confirm against AssociationRule that this is
        # the intended metric.
        return [rule for rule in self._rules
                if not (support > rule.coverage
                        or confidence > rule.confidence)]

    @property
    def support_threshold(self):
        """Support threshold passed to the most recent :meth:`learn`."""
        return self._apparent_support_threshold

    @property
    def confidence_threshold(self):
        """Confidence threshold passed to the most recent :meth:`learn`."""
        return self._apparent_confidence_threshold

    @property
    def coverage_threshold(self):
        """Coverage threshold passed to the most recent :meth:`learn`."""
        return self._apparent_coverage_threshold

    def load(self, data):
        """Load a set of transactions from a Iterable of lists.

        Any previously loaded transactions and generated rules are
        discarded.

        Parameters
        ----------
        data : Iterable of lists
            List of transactions
        """
        self._clear()
        for row in data:
            self._dataset.append(list(row))

    def load_from_csv(self, filename):
        """Load a set of transactions from a csv file.

        Each row of the file becomes one transaction. Any previously
        loaded transactions and generated rules are discarded.

        Parameters
        ----------
        filename : string
            Name of the csv file which contains a set of transactions
        """
        self._clear()
        import csv
        # newline='' lets the csv module do its own newline handling.
        with open(filename, newline='') as csvfile:
            for row in csv.reader(csvfile):
                self._dataset.append(row)

    def set_rule_key(self, key):
        """Set the key function which should be used to sort rules.

        The default key function sorts rules using lift, confidence and
        size of antecedent respectively. This behaviour can be changed
        using this method.

        Parameters
        ----------
        key : function
            The key function to sort rules
        """
        self._rule_key = key

    def _clear(self):
        """Drop the dataset and everything derived from it."""
        self._dataset = []
        self._rules = []
        self._itemcounts = {}
        # The stored rules are gone, so forget the thresholds they were
        # mined with; otherwise ``learn`` would skip re-mining new data.
        self._real_support_threshold = float('inf')
        self._real_confidence_threshold = float('inf')
        self._real_coverage_threshold = float('inf')

    def _clean_items(self, items):
        """Return `items` as a tuple."""
        return tuple(items)

    def _get_itemcount(self, items):
        """Return the number of transactions containing every item.

        Results are cached per distinct set of items until the dataset is
        reloaded.
        """
        key = frozenset(items)
        try:
            return self._itemcounts[key]
        except KeyError:
            pass
        count = sum(1 for data in self._dataset
                    if all(item in data for item in key))
        self._itemcounts[key] = count
        return count

    def _get_initial_itemset(self):
        """Return the sorted list of single-item candidates."""
        distinct_items = set(chain.from_iterable(self._dataset))
        return sorted([item] for item in distinct_items)

    def _should_join_candidate(self, candidate1, candidate2):
        """Tell whether two equal-length candidates can be joined.

        They can be joined when all items except the last are equal and
        the last items differ.
        """
        return (candidate1[:-1] == candidate2[:-1]
                and candidate1[-1] != candidate2[-1])

    def _get_nextgen_itemset(self, itemset):
        """Build the next, one-item-larger, generation of candidates."""
        new_items = []
        for i, first in enumerate(itemset):
            # A candidate never joins with itself, so start at i + 1.
            for j in range(i + 1, len(itemset)):
                second = itemset[j]
                if self._should_join_candidate(first, second):
                    new_items.append(sorted(set(first).union(second)))
        return new_items

    def _prune_itemset(self, itemset):
        """Remove, in place, candidates below the support threshold."""
        if not self._dataset:
            # No transactions: no candidate can have any support.
            del itemset[:]
            return
        # float() keeps this a true division on Python 2 as well.
        total = float(len(self._dataset))
        itemset[:] = [
            items for items in itemset
            if (round(self._get_itemcount(items) / total, 3)
                >= self._real_support_threshold)
        ]

    def _prune_rules(self):
        """Keep only rules that cover a still-active transaction.

        Rules are visited in their current order. A transaction counts
        how many kept rules matched it; once that count reaches the
        coverage threshold the transaction is retired (marked ``-1``) and
        no longer keeps further rules alive.
        """
        transactions = [self._clean_items(data) for data in self._dataset]
        data_cover_count = [0] * len(transactions)
        pruned_rules = []
        for rule in self._rules:
            rule_add = False
            for i, items in enumerate(transactions):
                if (rule.match_antecedent(items)
                        and data_cover_count[i] >= 0):
                    rule_add = True
                    data_cover_count[i] += 1
                    if data_cover_count[i] >= self._real_coverage_threshold:
                        data_cover_count[i] = -1

            if rule_add:
                pruned_rules.append(rule)

        self._rules = pruned_rules

    def _print_items(self):
        """Print every cached item combination with its count."""
        for item, count in self._itemcounts.items():
            print(tuple(item), count)

    def _generate_rules(self, itemset):
        """Append rules derived from `itemset` that are confident enough."""
        num_transactions = len(self._dataset)
        for items in itemset:
            # The count of the full itemset is the same for all its splits.
            count_both = self._get_itemcount(items)
            for element in get_subsets(items):
                remain = set(items).difference(set(element))
                if len(remain) > 0:
                    count_lhs = self._get_itemcount(element)
                    count_rhs = self._get_itemcount(remain)
                    rule = AssociationRule(tuple(element), tuple(remain),
                                           count_both, count_lhs, count_rhs,
                                           num_transactions)
                    if rule.confidence >= self._real_confidence_threshold:
                        self._rules.append(rule)

    def print_rules(self, attributes=('coverage', 'confidence', 'lift')):
        """Print the generated rules in a tabular format.

        Parameters
        ----------
        attributes : array_like
            Names of rule attributes to show, one column each, after the
            antecedent and consequent columns. Each name is read from the
            rule with ``getattr``; underscores are shown as spaces in the
            column header.
        """
        table = BeautifulTable()
        table.column_headers = (['Antecedent', 'Consequent']
                                + [attr.replace('_', ' ').title()
                                   for attr in attributes])

        table.column_alignments[0] = table.ALIGN_LEFT
        table.column_alignments[1] = table.ALIGN_LEFT
        for rule in self.rules:
            table.append_row([rule.antecedent2str(),
                              rule.consequent2str()]
                             + [getattr(rule, attr)
                                for attr in attributes])

        print(table)

    def _learn(self, support_threshold, confidence_threshold,
               coverage_threshold):
        """Mine rules from scratch with the given thresholds."""
        self._apparent_support_threshold = support_threshold
        self._apparent_confidence_threshold = confidence_threshold
        self._apparent_coverage_threshold = coverage_threshold

        self._real_support_threshold = support_threshold
        self._real_confidence_threshold = confidence_threshold
        self._real_coverage_threshold = coverage_threshold

        itemset = self._get_initial_itemset()
        self._rules = []
        while len(itemset) > 0:
            self._prune_itemset(itemset)
            self._generate_rules(itemset)
            itemset = self._get_nextgen_itemset(itemset)

        self._rules = list(set(self._rules))
        # Coverage pruning depends on the order rules are visited in, so
        # rank the rules first; pruning preserves that order.
        self._rules.sort(key=self._rule_key, reverse=True)
        self._prune_rules()

    def learn(self, support_threshold, confidence_threshold,
              coverage_threshold=20):
        """Generate Association rules from the Training dataset.

        Rules are only mined again when the stored ones cannot satisfy
        the request, i.e. when a threshold is lower than the one used for
        the stored rules or the coverage threshold differs. Otherwise the
        stored rules are reused and filtered by :attr:`rules`.

        Parameters
        ----------
        support_threshold : float
            User defined threshold between 0 and 1. Rules with support
            less than `support_threshold` are not generated.

        confidence_threshold : float
            User defined threshold between 0 and 1. Rules with confidence
            less than `confidence_threshold` are not generated.

        coverage_threshold : int
            Maximum number of rules a specific transaction can match.
            Once a transaction reaches this number it is no longer
            considered for matching other rules. Rules which do not
            match any remaining transaction are removed (Default 20).
        """
        if (support_threshold < self._real_support_threshold
                or confidence_threshold < self._real_confidence_threshold
                or coverage_threshold != self._real_coverage_threshold):
            self._learn(support_threshold, confidence_threshold,
                        coverage_threshold)

        self._apparent_support_threshold = support_threshold
        self._apparent_confidence_threshold = confidence_threshold
        self._apparent_coverage_threshold = coverage_threshold