In [1]:
import pandas as pd

Load data

The store sells 5 items:

  1. Milk
  2. Jam
  3. Bread
  4. Wheat Bread
  5. Butter

We have 4 transactions. In the first one, 3 products were purchased: Milk, Bread and Wheat Bread.

In [2]:
# Each tuple below is one transaction (the basket of items bought together);
# the comprehension turns every basket into a list, giving a list of lists.
data = [
    list(basket)
    for basket in (
        ('Milk', 'Bread', 'Wheat Bread'),
        ('Jam', 'Bread', 'Butter'),
        ('Milk', 'Jam', 'Butter'),
        ('Jam', 'Butter'),
    )
]
data
Out[2]:
[['Milk', 'Bread', 'Wheat Bread'],
 ['Jam', 'Bread', 'Butter'],
 ['Milk', 'Jam', 'Butter'],
 ['Jam', 'Butter']]
In [3]:
from mlxtend.preprocessing import TransactionEncoder
# Encoder used below to turn the list-of-lists transactions into a boolean
# table with one column per item and one row per transaction.
encoder = TransactionEncoder()
In [4]:
# One-hot encode the transactions: one row per transaction, one boolean
# column per item. `columns_` is only available once the encoder is fitted,
# so it is read after `fit_transform` has run.
onehot_matrix = encoder.fit_transform(data)
df = pd.DataFrame(onehot_matrix, columns=encoder.columns_)
df
Out[4]:
Bread Butter Jam Milk Wheat Bread
0 True False False True True
1 True True True False False
2 False True True True False
3 False True True False False

Using mlxtend

In [5]:
from mlxtend.frequent_patterns import apriori
In [6]:
# Frequent itemsets with support of at least 10%.
# use_colnames=False is the default, spelled out here because it is the reason
# the itemsets below are shown as column indices rather than item names.
df_support = apriori(df, min_support=0.1, use_colnames=False)
df_support
Out[6]:
support itemsets
0 0.50 (0)
1 0.75 (1)
2 0.75 (2)
3 0.50 (3)
4 0.25 (4)
5 0.25 (0, 1)
6 0.25 (0, 2)
7 0.25 (0, 3)
8 0.25 (0, 4)
9 0.75 (1, 2)
10 0.25 (1, 3)
11 0.25 (2, 3)
12 0.25 (3, 4)
13 0.25 (0, 1, 2)
14 0.25 (0, 3, 4)
15 0.25 (1, 2, 3)
In [7]:
# Same search as above, but use_colnames=True reports each itemset with the
# item names (DataFrame column names) instead of the column indices.
df_support = apriori(df, min_support=0.1, use_colnames=True)
df_support
Out[7]:
support itemsets
0 0.50 (Bread)
1 0.75 (Butter)
2 0.75 (Jam)
3 0.50 (Milk)
4 0.25 (Wheat Bread)
5 0.25 (Bread, Butter)
6 0.25 (Bread, Jam)
7 0.25 (Bread, Milk)
8 0.25 (Bread, Wheat Bread)
9 0.75 (Jam, Butter)
10 0.25 (Butter, Milk)
11 0.25 (Jam, Milk)
12 0.25 (Wheat Bread, Milk)
13 0.25 (Bread, Butter, Jam)
14 0.25 (Bread, Wheat Bread, Milk)
15 0.25 (Jam, Butter, Milk)
In [8]:
from mlxtend.frequent_patterns import association_rules
In [9]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence is at least 0.7.
rules = association_rules(
    df_support,
    metric="confidence",
    min_threshold=0.7,
)
rules
Out[9]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction
0 (Wheat Bread) (Bread) 0.25 0.50 0.25 1.0 2.000000 0.1250 inf
1 (Jam) (Butter) 0.75 0.75 0.75 1.0 1.333333 0.1875 inf
2 (Butter) (Jam) 0.75 0.75 0.75 1.0 1.333333 0.1875 inf
3 (Wheat Bread) (Milk) 0.25 0.50 0.25 1.0 2.000000 0.1250 inf
4 (Bread, Butter) (Jam) 0.25 0.75 0.25 1.0 1.333333 0.0625 inf
5 (Bread, Jam) (Butter) 0.25 0.75 0.25 1.0 1.333333 0.0625 inf
6 (Bread, Wheat Bread) (Milk) 0.25 0.50 0.25 1.0 2.000000 0.1250 inf
7 (Bread, Milk) (Wheat Bread) 0.25 0.25 0.25 1.0 4.000000 0.1875 inf
8 (Wheat Bread, Milk) (Bread) 0.25 0.50 0.25 1.0 2.000000 0.1250 inf
9 (Wheat Bread) (Bread, Milk) 0.25 0.25 0.25 1.0 4.000000 0.1875 inf
10 (Jam, Milk) (Butter) 0.25 0.75 0.25 1.0 1.333333 0.0625 inf
11 (Butter, Milk) (Jam) 0.25 0.75 0.25 1.0 1.333333 0.0625 inf
In [10]:
# Keep only the rules whose antecedent is exactly the single item 'Butter'.
rules.loc[rules['antecedents'] == frozenset({'Butter'})]
Out[10]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction
2 (Butter) (Jam) 0.75 0.75 0.75 1.0 1.333333 0.1875 inf

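If someone buys butter, we're 100% confident they'll also buy jam (confidence = 1.0). The 0.75 is the rule's support: butter and jam are bought together in 75% of all transactions.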

From scratch

In [11]:
# Minimum fraction of transactions an itemset must appear in to be kept.
MIN_SUPPORT = 0.1

# Number of transactions; reused by the following cells to compute support.
n = len(df)

# For each item, compute its support: the fraction of transactions
# (rows of the one-hot table) in which the item appears.
rows = []
for col in df.columns:
    support = df[col].sum()/n

    # ">=" rather than ">": MIN_SUPPORT is a *minimum*, so an item whose
    # support is exactly equal to it must be kept (this is also how
    # mlxtend's `apriori` treats `min_support`).
    if support >= MIN_SUPPORT:
        rows.append([support, frozenset([col])])

# Build the frame once from the collected rows instead of growing it
# row by row; an empty `rows` still yields the two expected columns.
df_support = pd.DataFrame(rows, columns=['support', 'itemsets'])
df_support
Out[11]:
support itemsets
0 0.50 (Bread)
1 0.75 (Butter)
2 0.75 (Jam)
3 0.50 (Milk)
4 0.25 (Wheat Bread)
In [12]:
import itertools
In [13]:
# For each item whose support is at least MIN_SUPPORT,
# create combinations of 2 and keep the frequent ones.
# Remember how many single-item rows there are: the next cell uses this
# offset to separate 1-itemsets from the 2-itemsets appended below.
offset_1 = len(df_support)

# `itertools.combinations` copies its input before yielding anything, so
# appending rows to `df_support` inside the loop does not affect the pairs.
for a, b in itertools.combinations(df_support['itemsets'], r=2):
    c = a | b  # union of the two single-item frozensets
    print(c)

    # A transaction supports the pair when every item column is True.
    # Columns are selected with a list: pandas documents list-like column
    # selection and rejects plain `set` indexers, so do not rely on how a
    # frozenset key happens to be handled.
    support = df[list(c)].all(axis=1).sum()/n

    # ">=" rather than ">": an itemset whose support is exactly MIN_SUPPORT
    # still counts as frequent (matches mlxtend's `min_support` semantics).
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [
            support, c
        ]

df_support
frozenset({'Bread', 'Butter'})
frozenset({'Bread', 'Jam'})
frozenset({'Bread', 'Milk'})
frozenset({'Bread', 'Wheat Bread'})
frozenset({'Jam', 'Butter'})
frozenset({'Butter', 'Milk'})
frozenset({'Butter', 'Wheat Bread'})
frozenset({'Jam', 'Milk'})
frozenset({'Jam', 'Wheat Bread'})
frozenset({'Wheat Bread', 'Milk'})
Out[13]:
support itemsets
0 0.50 (Bread)
1 0.75 (Butter)
2 0.75 (Jam)
3 0.50 (Milk)
4 0.25 (Wheat Bread)
5 0.25 (Bread, Butter)
6 0.25 (Bread, Jam)
7 0.25 (Bread, Milk)
8 0.25 (Bread, Wheat Bread)
9 0.75 (Jam, Butter)
10 0.25 (Butter, Milk)
11 0.25 (Jam, Milk)
12 0.25 (Wheat Bread, Milk)
In [14]:
# For each combination of 2 whose support is at least MIN_SUPPORT,
# create combinations of 3 by adding one frequent single item.
# The same triple can be reached from several (item, pair) couples,
# so remember the candidates that were already evaluated.
seen = set()

# Rows before `offset_1` are the 1-itemsets, rows from `offset_1` on are the
# 2-itemsets. `itertools.product` copies both inputs up front, so appending
# to `df_support` inside the loop does not change what is iterated.
for a, b in itertools.product(
    df_support[:offset_1]['itemsets'],
    df_support[offset_1:]['itemsets']
):
    c = a | b  # union of the single item and the pair

    # Check we've got a combination of 3 (the item may already be in the pair)
    if len(c) != 3:
        continue
    if c in seen:
        continue
    seen.add(c)

    # Check support: a transaction counts when all three columns are True.
    # Select the columns with a list; pandas rejects plain `set` indexers,
    # so do not rely on how a frozenset key happens to be handled.
    print(c)
    support = df[list(c)].all(axis=1).sum()/n

    # ">=" rather than ">": an itemset whose support is exactly MIN_SUPPORT
    # still counts as frequent (matches mlxtend's `min_support` semantics).
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [
            support, c
        ]

# Drop the reference to the bookkeeping set; it is not needed afterwards.
seen = None
df_support
frozenset({'Bread', 'Butter', 'Jam'})
frozenset({'Bread', 'Butter', 'Milk'})
frozenset({'Bread', 'Milk', 'Jam'})
frozenset({'Bread', 'Wheat Bread', 'Milk'})
frozenset({'Bread', 'Butter', 'Wheat Bread'})
frozenset({'Jam', 'Butter', 'Milk'})
frozenset({'Butter', 'Wheat Bread', 'Milk'})
frozenset({'Jam', 'Bread', 'Wheat Bread'})
frozenset({'Jam', 'Wheat Bread', 'Milk'})
frozenset({'Jam', 'Butter', 'Wheat Bread'})
Out[14]:
support itemsets
0 0.50 (Bread)
1 0.75 (Butter)
2 0.75 (Jam)
3 0.50 (Milk)
4 0.25 (Wheat Bread)
5 0.25 (Bread, Butter)
6 0.25 (Bread, Jam)
7 0.25 (Bread, Milk)
8 0.25 (Bread, Wheat Bread)
9 0.75 (Jam, Butter)
10 0.25 (Butter, Milk)
11 0.25 (Jam, Milk)
12 0.25 (Wheat Bread, Milk)
13 0.25 (Bread, Butter, Jam)
14 0.25 (Bread, Wheat Bread, Milk)
15 0.25 (Jam, Butter, Milk)
In [15]:
# Record how many items each itemset contains so rows can be filtered by size.
df_support['length'] = df_support['itemsets'].map(len)
df_support
Out[15]:
support itemsets length
0 0.50 (Bread) 1
1 0.75 (Butter) 1
2 0.75 (Jam) 1
3 0.50 (Milk) 1
4 0.25 (Wheat Bread) 1
5 0.25 (Bread, Butter) 2
6 0.25 (Bread, Jam) 2
7 0.25 (Bread, Milk) 2
8 0.25 (Bread, Wheat Bread) 2
9 0.75 (Jam, Butter) 2
10 0.25 (Butter, Milk) 2
11 0.25 (Jam, Milk) 2
12 0.25 (Wheat Bread, Milk) 2
13 0.25 (Bread, Butter, Jam) 3
14 0.25 (Bread, Wheat Bread, Milk) 3
15 0.25 (Jam, Butter, Milk) 3
In [16]:
# Show only the itemsets made of exactly two items.
df_support.loc[df_support['length'].eq(2)]
Out[16]:
support itemsets length
5 0.25 (Bread, Butter) 2
6 0.25 (Bread, Jam) 2
7 0.25 (Bread, Milk) 2
8 0.25 (Bread, Wheat Bread) 2
9 0.75 (Jam, Butter) 2
10 0.25 (Butter, Milk) 2
11 0.25 (Jam, Milk) 2
12 0.25 (Wheat Bread, Milk) 2