import pandas as pd
The store sells 5 items: Milk, Bread, Wheat Bread, Jam and Butter.
We have 4 transactions. In the first one, 3 products were purchased: Milk, Bread and Wheat Bread.
# One inner list per transaction: the items bought together in that purchase.
data = [
    ['Milk', 'Bread', 'Wheat Bread'],
    ['Jam', 'Bread', 'Butter'],
    ['Milk', 'Jam', 'Butter'],
    ['Jam', 'Butter'],
]
data
from mlxtend.preprocessing import TransactionEncoder
# Turn the list-of-lists transactions into a table with one row per
# transaction and one column per distinct item.
encoder = TransactionEncoder()
# fit() learns the item names (exposed afterwards as `columns_`) and returns
# the encoder, so the transform can be chained on the same data.
df = pd.DataFrame(data=encoder.fit(data).transform(data), columns=encoder.columns_)
df
from mlxtend.frequent_patterns import apriori
# First pass with the default labelling: itemsets are reported by column index.
df_support = apriori(df, min_support=0.1, use_colnames=False)
df_support
# Second pass: identical mining, but itemsets are reported by column name.
df_support = apriori(df, use_colnames=True, min_support=0.1)
df_support
from mlxtend.frequent_patterns import association_rules
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence reaches 0.7.
rules = association_rules(
    df_support,
    metric="confidence",
    min_threshold=0.7,
)
rules
# Rules whose left-hand side is exactly {Butter}.
rules.loc[rules['antecedents'] == {'Butter'}]
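If someone buys butter, the rule says they'll also buy jam: butter and jam appear together in 75% of the transactions (support), and every transaction that contains butter also contains jam (100% confidence).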
# Minimum share of transactions an itemset must appear in to be kept.
MIN_SUPPORT = 0.1
# Frequent itemsets found so far: one row per itemset, with its support.
df_support = pd.DataFrame(columns=['support', 'itemsets'])
n = len(df)  # number of transactions (rows of the encoded table)
# Level 1: the support of a single item is the share of rows in which its
# column is set.
for col in df.columns:
    support = df[col].sum() / n
    # ">=" keeps items sitting exactly on the threshold ("at least 10%"),
    # which is also how mlxtend's apriori applies `min_support`.
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, frozenset([col])]
df_support
import itertools
# Level 2: pair up every single item that appears in at least MIN_SUPPORT of
# the transactions, and keep the pairs that reach the threshold as well.
offset_1 = len(df_support)  # rows [0, offset_1) hold the 1-itemsets
# combinations() copies the column into a tuple up front, so the rows
# appended inside the loop are not fed back into the iteration.
for a, b in itertools.combinations(df_support['itemsets'], r=2):
    c = a | b
    print(c)
    # A transaction contains the pair when every one of its columns is set.
    # Index with a list: set-like column indexers are not a supported pandas
    # API (plain sets raise TypeError since pandas 2.0).
    support = df[list(c)].all(axis=1).sum() / n
    # ">=" so that "at least" includes the threshold itself.
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, c]
df_support
# Level 3: extend every frequent pair (rows from offset_1 onward) with every
# frequent single item (rows before offset_1) and keep the triples that reach
# MIN_SUPPORT.
seen = set()  # candidate triples that have already been evaluated
# product() copies both slices up front, so rows appended inside the loop are
# not fed back into the iteration.
for a, b in itertools.product(
    df_support[:offset_1]['itemsets'],
    df_support[offset_1:]['itemsets'],
):
    c = a | b
    # Skip unions that are not triples (the single item was already part of
    # the pair) and triples already reached through another pair.
    if len(c) != 3 or c in seen:
        continue
    seen.add(c)
    print(c)
    # Index with a list: set-like column indexers are not a supported pandas
    # API (plain sets raise TypeError since pandas 2.0).
    support = df[list(c)].all(axis=1).sum() / n
    # ">=" so that "at least" includes the threshold itself.
    if support >= MIN_SUPPORT:
        df_support.loc[len(df_support)] = [support, c]
seen = None  # release the scratch set
df_support
# Record how many items each itemset holds, then show only the pairs.
df_support['length'] = df_support['itemsets'].apply(len)
df_support
df_support.loc[df_support['length'] == 2]