import pandas as pd

Load data¶

The store sells 5 items:

Milk
Jam
Bread
Wheat Bread
Butter

We have 4 transactions. In the first one, 3 products were purchased: Milk, Bread and Wheat Bread.

# One inner list per transaction: the products that were bought together.
data = [
    list(basket)
    for basket in (
        ('Milk', 'Bread', 'Wheat Bread'),
        ('Jam', 'Bread', 'Butter'),
        ('Milk', 'Jam', 'Butter'),
        ('Jam', 'Butter'),
    )
]
data

[['Milk', 'Bread', 'Wheat Bread'],
 ['Jam', 'Bread', 'Butter'],
 ['Milk', 'Jam', 'Butter'],
 ['Jam', 'Butter']]

from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: one row per transaction and one
# boolean column per product.
encoder = TransactionEncoder()
encoder.fit(data)

df = pd.DataFrame(encoder.transform(data), columns=encoder.columns_)
df

Using mlxtend¶

from mlxtend.frequent_patterns import apriori

# Without column names, itemsets are reported as column indices.
df_support = apriori(df, min_support=0.1, use_colnames=False)
df_support

# Same search, with itemsets reported by product name.
df_support = apriori(df, use_colnames=True, min_support=0.1)
df_support

from mlxtend.frequent_patterns import association_rules

# Derive rules from the frequent itemsets, using 0.7 as the minimum
# confidence threshold.
rules = association_rules(df_support, min_threshold=0.7, metric="confidence")
rules

# Rules whose antecedent is exactly {Butter}.
rules.loc[rules['antecedents'] == {'Butter'}]

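If someone buys butter, we're 100% confident they'll also buy jam (confidence = 1.0). The 0.75 shown for this rule is its support: butter and jam appear together in 75% of all transactions.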

From scratch¶

# Minimum fraction of transactions an itemset must appear in to be kept.
MIN_SUPPORT = 0.1

# Frequent itemsets found so far: one row per itemset with its support.
df_support = pd.DataFrame(columns=['support', 'itemsets'])
n = len(df)  # total number of transactions

# Pass 1: support of every single item, i.e. the fraction of
# transactions (rows of the one-hot table) that contain it.
for col in df.columns:
    support = df[col].sum() / n

    # Inclusive threshold: an item whose support is exactly MIN_SUPPORT
    # appears "at least" that often, so it must be kept (this is also
    # the rule mlxtend's apriori applies to min_support).
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, frozenset([col])]
df_support

import itertools

# Pass 2: pair up the single items that reached MIN_SUPPORT and keep the
# pairs that reach it as well.
# offset_1 marks where the single-item rows end in df_support.
offset_1 = len(df_support)

for a, b in itertools.combinations(df_support['itemsets'], r=2):
    c = a | b
    print(c)

    # Fraction of transactions that contain every item of the candidate.
    # The columns are selected with a list: a frozenset is hashable, so
    # df[c] would first be tried as a single column label.
    support = df[list(c)].all(axis=1).sum() / n

    # Inclusive threshold, matching "at least MIN_SUPPORT".
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, c]

df_support

frozenset({'Bread', 'Butter'})
frozenset({'Bread', 'Jam'})
frozenset({'Bread', 'Milk'})
frozenset({'Bread', 'Wheat Bread'})
frozenset({'Jam', 'Butter'})
frozenset({'Butter', 'Milk'})
frozenset({'Butter', 'Wheat Bread'})
frozenset({'Jam', 'Milk'})
frozenset({'Jam', 'Wheat Bread'})
frozenset({'Wheat Bread', 'Milk'})

# Pass 3: extend every frequent pair with a frequent single item to get
# candidate triples, then keep the triples that reach MIN_SUPPORT.
# The same triple can be produced from several (item, pair) couples, so
# candidates already evaluated are remembered in `seen`.
seen = set()

for a, b in itertools.product(
    df_support['itemsets'].iloc[:offset_1],   # single items
    df_support['itemsets'].iloc[offset_1:],   # pairs
):
    c = a | b

    # Skip unions that are not triples (the item was already part of the
    # pair) and triples that were already evaluated.
    if len(c) != 3 or c in seen:
        continue
    seen.add(c)

    # Check support
    print(c)
    # Columns are selected with a list: a frozenset is hashable, so
    # df[c] would first be tried as a single column label.
    support = df[list(c)].all(axis=1).sum() / n

    # Inclusive threshold, matching "at least MIN_SUPPORT".
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, c]

seen = None  # drop the helper set once the pass is done
df_support

frozenset({'Bread', 'Butter', 'Jam'})
frozenset({'Bread', 'Butter', 'Milk'})
frozenset({'Bread', 'Milk', 'Jam'})
frozenset({'Bread', 'Wheat Bread', 'Milk'})
frozenset({'Bread', 'Butter', 'Wheat Bread'})
frozenset({'Jam', 'Butter', 'Milk'})
frozenset({'Butter', 'Wheat Bread', 'Milk'})
frozenset({'Jam', 'Bread', 'Wheat Bread'})
frozenset({'Jam', 'Wheat Bread', 'Milk'})
frozenset({'Jam', 'Butter', 'Wheat Bread'})

# Record how many items each itemset contains.
df_support['length'] = df_support['itemsets'].map(len)
df_support

# Show only the itemsets made of two items.
df_support.loc[df_support['length'] == 2]

	support	itemsets
0	0.50	(0)
1	0.75	(1)
2	0.75	(2)
3	0.50	(3)
4	0.25	(4)
5	0.25	(0, 1)
6	0.25	(0, 2)
7	0.25	(0, 3)
8	0.25	(0, 4)
9	0.75	(1, 2)
10	0.25	(1, 3)
11	0.25	(2, 3)
12	0.25	(3, 4)
13	0.25	(0, 1, 2)
14	0.25	(0, 3, 4)
15	0.25	(1, 2, 3)

	support	itemsets
0	0.50	(Bread)
1	0.75	(Butter)
2	0.75	(Jam)
3	0.50	(Milk)
4	0.25	(Wheat Bread)
5	0.25	(Bread, Butter)
6	0.25	(Bread, Jam)
7	0.25	(Bread, Milk)
8	0.25	(Bread, Wheat Bread)
9	0.75	(Jam, Butter)
10	0.25	(Butter, Milk)
11	0.25	(Jam, Milk)
12	0.25	(Wheat Bread, Milk)
13	0.25	(Bread, Butter, Jam)
14	0.25	(Bread, Wheat Bread, Milk)
15	0.25	(Jam, Butter, Milk)

	antecedents	consequents	antecedent support	consequent support	support	confidence	lift	leverage	conviction
0	(Wheat Bread)	(Bread)	0.25	0.50	0.25	1.0	2.000000	0.1250	inf
1	(Jam)	(Butter)	0.75	0.75	0.75	1.0	1.333333	0.1875	inf
2	(Butter)	(Jam)	0.75	0.75	0.75	1.0	1.333333	0.1875	inf
3	(Wheat Bread)	(Milk)	0.25	0.50	0.25	1.0	2.000000	0.1250	inf
4	(Bread, Butter)	(Jam)	0.25	0.75	0.25	1.0	1.333333	0.0625	inf
5	(Bread, Jam)	(Butter)	0.25	0.75	0.25	1.0	1.333333	0.0625	inf
6	(Bread, Wheat Bread)	(Milk)	0.25	0.50	0.25	1.0	2.000000	0.1250	inf
7	(Bread, Milk)	(Wheat Bread)	0.25	0.25	0.25	1.0	4.000000	0.1875	inf
8	(Wheat Bread, Milk)	(Bread)	0.25	0.50	0.25	1.0	2.000000	0.1250	inf
9	(Wheat Bread)	(Bread, Milk)	0.25	0.25	0.25	1.0	4.000000	0.1875	inf
10	(Jam, Milk)	(Butter)	0.25	0.75	0.25	1.0	1.333333	0.0625	inf
11	(Butter, Milk)	(Jam)	0.25	0.75	0.25	1.0	1.333333	0.0625	inf

	support	itemsets
0	0.50	(Bread)
1	0.75	(Butter)
2	0.75	(Jam)
3	0.50	(Milk)
4	0.25	(Wheat Bread)

	support	itemsets
0	0.50	(Bread)
1	0.75	(Butter)
2	0.75	(Jam)
3	0.50	(Milk)
4	0.25	(Wheat Bread)
5	0.25	(Bread, Butter)
6	0.25	(Bread, Jam)
7	0.25	(Bread, Milk)
8	0.25	(Bread, Wheat Bread)
9	0.75	(Jam, Butter)
10	0.25	(Butter, Milk)
11	0.25	(Jam, Milk)
12	0.25	(Wheat Bread, Milk)

	Bread	Butter	Jam	Milk	Wheat Bread
0	True	False	False	True	True
1	True	True	True	False	False
2	False	True	True	True	False
3	False	True	True	False	False