Instacart Market Basket Analysis¶
Project Overview¶
This project explores customer shopping behavior using Instacart's grocery shopping dataset. Through comprehensive exploratory data analysis (EDA), we uncover patterns in when customers shop, what they buy, and how often they reorder products.
Objectives¶
- Analyze shopping patterns by time of day and day of week
- Identify the most popular products and categories
- Examine customer reordering behavior
- Understand typical basket sizes and composition
Dataset¶
The analysis uses five interconnected datasets:
- orders: Customer order history with timestamps
- products: Product catalog with names and categories
- departments: Department classifications
- aisles: Aisle classifications
- order_products: Products included in each order
Key Findings¶
This analysis reveals insights into grocery shopping habits that can inform:
- Inventory management and stock optimization
- Marketing campaign timing
- Product recommendation systems
- Customer retention strategies
Importing the required libraries¶
# Import the libraries needed for this analysis
import pandas as pd
import matplotlib.pyplot as plt
# Load the datasets
# Note: These files use semicolon (;) as the separator instead of comma
orders = pd.read_csv('datasets/instacart_orders.csv', sep=';')
products = pd.read_csv('datasets/products.csv', sep=';')
departments = pd.read_csv('datasets/departments.csv', sep=';')
aisles = pd.read_csv('datasets/aisles.csv', sep=';')
order_products = pd.read_csv('datasets/order_products.csv', sep=';')
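As a quick sanity check after loading, we can confirm each frame's dimensions (a small convenience sketch; it relies only on the DataFrames loaded above).
# Sanity check: print each dataframe's dimensions to confirm the files loaded
for name, df in [('orders', orders), ('products', products),
                 ('departments', departments), ('aisles', aisles),
                 ('order_products', order_products)]:
    print(f'{name}: {df.shape[0]} rows x {df.shape[1]} columns')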
Analyze your data¶
Examining the structure and content of each dataset using .info() and .head() methods to understand data types, null counts, and sample values.
# Display orders dataset
orders
| | order_id | user_id | order_number | order_dow | order_hour_of_day | days_since_prior_order |
|---|---|---|---|---|---|---|
| 0 | 1515936 | 183418 | 11 | 6 | 13 | 30.0 |
| 1 | 1690866 | 163593 | 5 | 5 | 12 | 9.0 |
| 2 | 1454967 | 39980 | 4 | 5 | 19 | 2.0 |
| 3 | 1768857 | 82516 | 56 | 0 | 20 | 10.0 |
| 4 | 3007858 | 196724 | 2 | 4 | 12 | 17.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 478962 | 3210681 | 5617 | 5 | 1 | 14 | 7.0 |
| 478963 | 3270802 | 112087 | 2 | 3 | 13 | 6.0 |
| 478964 | 885349 | 82944 | 16 | 2 | 11 | 6.0 |
| 478965 | 216274 | 4391 | 3 | 3 | 8 | 8.0 |
| 478966 | 2071924 | 1730 | 18 | 1 | 14 | 15.0 |
478967 rows × 6 columns
# Display products dataset
products
| | product_id | product_name | aisle_id | department_id |
|---|---|---|---|---|
| 0 | 1 | Chocolate Sandwich Cookies | 61 | 19 |
| 1 | 2 | All-Seasons Salt | 104 | 13 |
| 2 | 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 3 | 4 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
| 4 | 5 | Green Chile Anytime Sauce | 5 | 13 |
| ... | ... | ... | ... | ... |
| 49689 | 49690 | HIGH PERFORMANCE ENERGY DRINK | 64 | 7 |
| 49690 | 49691 | ORIGINAL PANCAKE & WAFFLE MIX | 130 | 14 |
| 49691 | 49692 | ORGANIC INSTANT OATMEAL LIGHT MAPLE BROWN SUGAR | 130 | 14 |
| 49692 | 49693 | SPRING WATER BODY WASH | 127 | 11 |
| 49693 | 49694 | BURRITO- STEAK & CHEESE | 38 | 1 |
49694 rows × 4 columns
Examining All Datasets¶
Reviewing each dataset systematically to understand the structure, column types, and data quality.
departments
| | department_id | department |
|---|---|---|
| 0 | 1 | frozen |
| 1 | 2 | other |
| 2 | 3 | bakery |
| 3 | 4 | produce |
| 4 | 5 | alcohol |
| 5 | 6 | international |
| 6 | 7 | beverages |
| 7 | 8 | pets |
| 8 | 9 | dry goods pasta |
| 9 | 10 | bulk |
| 10 | 11 | personal care |
| 11 | 12 | meat seafood |
| 12 | 13 | pantry |
| 13 | 14 | breakfast |
| 14 | 15 | canned goods |
| 15 | 16 | dairy eggs |
| 16 | 17 | household |
| 17 | 18 | babies |
| 18 | 19 | snacks |
| 19 | 20 | deli |
| 20 | 21 | missing |
aisles
| | aisle_id | aisle |
|---|---|---|
| 0 | 1 | prepared soups salads |
| 1 | 2 | specialty cheeses |
| 2 | 3 | energy granola bars |
| 3 | 4 | instant foods |
| 4 | 5 | marinades meat preparation |
| ... | ... | ... |
| 129 | 130 | hot cereal pancake mixes |
| 130 | 131 | dry pasta |
| 131 | 132 | beauty |
| 132 | 133 | muscles joints pain relief |
| 133 | 134 | specialty wines champagnes |
134 rows × 2 columns
order_products
| | order_id | product_id | add_to_cart_order | reordered |
|---|---|---|---|---|
| 0 | 2141543 | 11440 | 17.0 | 0 |
| 1 | 567889 | 1560 | 1.0 | 1 |
| 2 | 2261212 | 26683 | 1.0 | 1 |
| 3 | 491251 | 8670 | 35.0 | 1 |
| 4 | 2571142 | 1940 | 5.0 | 1 |
| ... | ... | ... | ... | ... |
| 4545002 | 577211 | 15290 | 12.0 | 1 |
| 4545003 | 1219554 | 21914 | 9.0 | 0 |
| 4545004 | 692640 | 47766 | 4.0 | 1 |
| 4545005 | 319435 | 691 | 8.0 | 1 |
| 4545006 | 1398151 | 28733 | 9.0 | 0 |
4545007 rows × 4 columns
# Analyze orders dataset structure and completeness
orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478967 entries, 0 to 478966
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                478967 non-null  int64  
 1   user_id                 478967 non-null  int64  
 2   order_number            478967 non-null  int64  
 3   order_dow               478967 non-null  int64  
 4   order_hour_of_day       478967 non-null  int64  
 5   days_since_prior_order  450148 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 21.9 MB
Identifying Missing Values¶
The Non-Null Count column indicates data completeness. With 478,967 total entries, any column with fewer non-null values contains missing data that requires investigation.
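Before inspecting each frame individually, a small loop over all five frames can surface just the columns with missing data (a convenience sketch using the frames loaded above).
# Print only the columns that contain missing values in each dataframe
for name, df in [('orders', orders), ('products', products),
                 ('departments', departments), ('aisles', aisles),
                 ('order_products', order_products)]:
    missing = df.isna().sum()
    missing = missing[missing > 0]
    print(f'{name}: {"no missing values" if missing.empty else missing.to_dict()}')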
# Analyze order_products dataset (using show_counts=True for large dataset)
order_products.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4545007 entries, 0 to 4545006
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   order_id           4545007 non-null  int64  
 1   product_id         4545007 non-null  int64  
 2   add_to_cart_order  4544171 non-null  float64
 3   reordered          4545007 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 138.7 MB
Analyzing Remaining Datasets¶
Checking the remaining datasets for structure and missing values.
products.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49694 entries, 0 to 49693
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     49694 non-null  int64 
 1   product_name   48436 non-null  object
 2   aisle_id       49694 non-null  int64 
 3   department_id  49694 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB
departments.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   department_id  21 non-null     int64 
 1   department     21 non-null     object
dtypes: int64(1), object(1)
memory usage: 468.0+ bytes
aisles.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   aisle_id  134 non-null    int64 
 1   aisle     134 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.2+ KB
Data Quality: Handling Missing Values¶
Several columns contain missing values that require investigation and appropriate treatment based on the nature of each case.
products data frame¶
We will begin by addressing the missing values in the products data frame.
# Display rows where the product_name column has missing values
products[products['product_name'].isna()]
| | product_id | product_name | aisle_id | department_id |
|---|---|---|---|---|
| 37 | 38 | NaN | 100 | 21 |
| 71 | 72 | NaN | 100 | 21 |
| 109 | 110 | NaN | 100 | 21 |
| 296 | 297 | NaN | 100 | 21 |
| 416 | 417 | NaN | 100 | 21 |
| ... | ... | ... | ... | ... |
| 49552 | 49553 | NaN | 100 | 21 |
| 49574 | 49575 | NaN | 100 | 21 |
| 49640 | 49641 | NaN | 100 | 21 |
| 49663 | 49664 | NaN | 100 | 21 |
| 49668 | 49669 | NaN | 100 | 21 |
1258 rows × 4 columns
Every row with a missing product_name appears to have aisle_id 100 and department_id 21, which seems suspicious. Let's verify this by checking whether any rows with a missing product_name have an aisle_id other than 100 or a department_id other than 21. This will confirm whether the issue is isolated to this aisle and department.
# Combine conditions to check for missing product names in aisles other than 100
products[(products['product_name'].isna()) & (products['aisle_id'] != 100)].sum()
product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: object
# Combine conditions to check for missing product names in departments other than 21
products[(products['product_name'].isna()) & (products['department_id'] != 21)].sum()
product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: object
To better understand the missing product_name values, let’s determine what department_id 21 and aisle_id 100 represent by referencing the departments and aisles tables.
# What is this aisle and department?
print(aisles[aisles['aisle_id'] == 100])
print(departments[departments['department_id'] == 21])
aisle_id aisle
99 100 missing
department_id department
20 21 missing
# Fill missing product names with 'Unknown'
products['product_name'] = products['product_name'].fillna('Unknown')
print(products.isnull().sum()) # Check for any missing values
products[products['product_name'] == 'Unknown'] # Check replacement of NaN with Unknown
product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64
| | product_id | product_name | aisle_id | department_id |
|---|---|---|---|---|
| 37 | 38 | Unknown | 100 | 21 |
| 71 | 72 | Unknown | 100 | 21 |
| 109 | 110 | Unknown | 100 | 21 |
| 296 | 297 | Unknown | 100 | 21 |
| 416 | 417 | Unknown | 100 | 21 |
| ... | ... | ... | ... | ... |
| 49552 | 49553 | Unknown | 100 | 21 |
| 49574 | 49575 | Unknown | 100 | 21 |
| 49640 | 49641 | Unknown | 100 | 21 |
| 49663 | 49664 | Unknown | 100 | 21 |
| 49668 | 49669 | Unknown | 100 | 21 |
1258 rows × 4 columns
orders data frame¶
Now let's investigate the missing values in the orders table.
# Display rows where the days_since_prior_order column has missing values
orders[orders['days_since_prior_order'].isna()]
| | order_id | user_id | order_number | order_dow | order_hour_of_day | days_since_prior_order |
|---|---|---|---|---|---|---|
| 28 | 133707 | 182261 | 1 | 3 | 10 | NaN |
| 96 | 787445 | 25685 | 1 | 6 | 18 | NaN |
| 100 | 294410 | 111449 | 1 | 0 | 19 | NaN |
| 103 | 2869915 | 123958 | 1 | 4 | 16 | NaN |
| 104 | 2521921 | 42286 | 1 | 3 | 18 | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 478895 | 2589657 | 205028 | 1 | 0 | 16 | NaN |
| 478896 | 2222353 | 141211 | 1 | 2 | 13 | NaN |
| 478922 | 2272807 | 204154 | 1 | 1 | 15 | NaN |
| 478926 | 2499542 | 68810 | 1 | 4 | 19 | NaN |
| 478945 | 1387033 | 22496 | 1 | 5 | 14 | NaN |
28819 rows × 6 columns
# Are there any missing values where it's not a customer's first order?
orders[orders['days_since_prior_order'].isna() & (orders['order_number'] != 1)].sum()
order_id                  0.0
user_id                   0.0
order_number              0.0
order_dow                 0.0
order_hour_of_day         0.0
days_since_prior_order    0.0
dtype: float64
All of the missing 'days_since_prior_order' values correspond to a customer's first ever order. This makes sense because there is no prior order! We'll leave the values as NaN so the column can remain numeric. Also, the NaN values shouldn't interfere with any calculations we might do using this column.
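Since pandas aggregations skip NaN by default, leaving the values in place is safe. A quick illustrative check:
# Aggregations like mean() skip NaN automatically, so first orders
# simply drop out of the calculation rather than distorting it
print('Mean days between orders:', orders['days_since_prior_order'].mean())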
order_products data frame¶
Now let's fill in missing values from the order_products table.
# Display rows where the add_to_cart_order column has missing values
order_products[order_products['add_to_cart_order'].isna()]
| | order_id | product_id | add_to_cart_order | reordered |
|---|---|---|---|---|
| 737 | 2449164 | 5068 | NaN | 0 |
| 9926 | 1968313 | 43867 | NaN | 0 |
| 14394 | 2926893 | 11688 | NaN | 0 |
| 16418 | 1717990 | 4142 | NaN | 0 |
| 30114 | 1959075 | 42828 | NaN | 1 |
| ... | ... | ... | ... | ... |
| 4505662 | 1800005 | 7411 | NaN | 0 |
| 4511400 | 1633337 | 260 | NaN | 0 |
| 4517562 | 404157 | 9517 | NaN | 0 |
| 4534112 | 1673227 | 17835 | NaN | 0 |
| 4535739 | 1832957 | 17949 | NaN | 1 |
836 rows × 4 columns
# Use .min() and .max() to find the minimum and maximum values for this column.
print('Min:', order_products['add_to_cart_order'].min())
print('Max:', order_products['add_to_cart_order'].max())
Min: 1.0
Max: 64.0
# Save all order IDs with at least one missing value in 'add_to_cart_order'
orders_with_missing = order_products[order_products['add_to_cart_order'].isna()]['order_id'].unique()
# Do all orders with missing values have more than 64 products?
order_counts = order_products.groupby('order_id')['product_id'].count() # Count the products within each order id
orders_with_missing_counts = order_counts[orders_with_missing] # Filter to only include order ids that have a missing value in 'add_to_cart_order'
print(orders_with_missing_counts)
print('Min products:', orders_with_missing_counts.min()) # Find the min number of products within those orders
order_id
2449164 76
1968313 80
2926893 92
1717990 86
1959075 98
..
9310 65
2170451 65
2979697 66
1625713 66
1529171 66
Name: product_id, Length: 70, dtype: int64
Min products: 65
# Replace missing values with 999 and convert column to integer type
order_products['add_to_cart_order'] = order_products['add_to_cart_order'].fillna(999).astype(int)
print(order_products.info()) # Check type change to int
order_products[order_products['add_to_cart_order'] == 999] # Check NaN was replaced with 999
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4545007 entries, 0 to 4545006
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 138.7 MB
None
| | order_id | product_id | add_to_cart_order | reordered |
|---|---|---|---|---|
| 737 | 2449164 | 5068 | 999 | 0 |
| 9926 | 1968313 | 43867 | 999 | 0 |
| 14394 | 2926893 | 11688 | 999 | 0 |
| 16418 | 1717990 | 4142 | 999 | 0 |
| 30114 | 1959075 | 42828 | 999 | 1 |
| ... | ... | ... | ... | ... |
| 4505662 | 1800005 | 7411 | 999 | 0 |
| 4511400 | 1633337 | 260 | 999 | 0 |
| 4517562 | 404157 | 9517 | 999 | 0 |
| 4534112 | 1673227 | 17835 | 999 | 0 |
| 4535739 | 1832957 | 17949 | 999 | 1 |
836 rows × 4 columns
For some reason, any item placed in the cart 65th or later has a missing value in the 'add_to_cart_order' column. Perhaps the column's data type in the source database could only hold integer values from 1 to 64. We've decided to replace the missing values with a code value, 999, representing an unknown add-to-cart position above 64, and to convert the column to integer type. We just need to remember this if we perform calculations using this column later in the analysis.
Other sensible code values we could've used are 0 or -1 because they don't show up elsewhere in the dataset and they don't have any real physical meaning for this variable.
Also note that, for orders with exactly 65 items, we could replace the missing value with 65. We'll forgo that for now, since we can't determine the positions of the missing items in orders with 66 items or more.
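One caveat worth demonstrating: any statistic computed on this column should exclude the 999 sentinel first (a cautionary sketch, not part of the cleaning itself).
# Exclude the 999 sentinel before computing statistics on cart position
valid_positions = order_products.loc[
    order_products['add_to_cart_order'] != 999, 'add_to_cart_order']
print('Mean cart position (sentinel excluded):', valid_positions.mean())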
Data Quality: Identifying and Removing Duplicates¶
Duplicate rows can skew analysis results and lead to inaccurate insights. Systematically checking each dataset for duplicates using .duplicated() and removing them with .drop_duplicates() ensures data integrity.
orders data frame¶
# Find the number of duplicate rows in the orders dataframe
print('Number of duplicate rows:', orders.duplicated().sum())
Number of duplicate rows: 15
# View the duplicate rows
orders[orders.duplicated()]
| | order_id | user_id | order_number | order_dow | order_hour_of_day | days_since_prior_order |
|---|---|---|---|---|---|---|
| 145574 | 794638 | 50898 | 24 | 3 | 2 | 2.0 |
| 223105 | 2160484 | 107525 | 16 | 3 | 2 | 30.0 |
| 230807 | 1918001 | 188546 | 14 | 3 | 2 | 16.0 |
| 266232 | 1782114 | 106752 | 1 | 3 | 2 | NaN |
| 273805 | 1112182 | 202304 | 84 | 3 | 2 | 6.0 |
| 284038 | 2845099 | 31189 | 11 | 3 | 2 | 7.0 |
| 311713 | 1021560 | 53767 | 3 | 3 | 2 | 9.0 |
| 321100 | 408114 | 68324 | 4 | 3 | 2 | 18.0 |
| 323900 | 1919531 | 191501 | 32 | 3 | 2 | 7.0 |
| 345917 | 2232988 | 82565 | 1 | 3 | 2 | NaN |
| 371905 | 391768 | 57671 | 19 | 3 | 2 | 10.0 |
| 394347 | 467134 | 63189 | 21 | 3 | 2 | 2.0 |
| 411408 | 1286742 | 183220 | 48 | 3 | 2 | 4.0 |
| 415163 | 2282673 | 86751 | 49 | 3 | 2 | 2.0 |
| 441599 | 2125197 | 14050 | 48 | 3 | 2 | 3.0 |
# Remove duplicate orders
orders = orders.drop_duplicates().reset_index(drop=True)
# Double check for duplicate rows
print('Number of duplicate rows:', orders.duplicated().sum())
Number of duplicate rows: 0
products data frame¶
# Check for fully duplicate rows
print('Number of fully duplicate rows:', products.duplicated().sum())
Number of fully duplicate rows: 0
# Check for just duplicate product IDs using subset='product_id' in duplicated()
print('Number of duplicate product IDs:', products.duplicated(subset='product_id').sum())
Number of duplicate product IDs: 0
Standardizing product names to lowercase for case-insensitive duplicate detection.
# Check for just duplicate product names (convert names to lowercase to compare better)
print('Number of duplicate product names:', products['product_name'].str.lower().duplicated().sum())
Number of duplicate product names: 1361
Let's see what that looks like in our dataset:
products[products['product_name'].str.lower() == 'high performance energy drink']
| | product_id | product_name | aisle_id | department_id |
|---|---|---|---|---|
| 22540 | 22541 | High Performance Energy Drink | 64 | 7 |
| 49689 | 49690 | HIGH PERFORMANCE ENERGY DRINK | 64 | 7 |
# Drop duplicate product names (case insensitive)
products['product_name'] = products['product_name'].str.lower() # Mutate products dataframe to all lowercase product names
products_before_drop = products # Save the pre-deduplication dataframe for the reorder analyses later in the notebook
products = products.drop_duplicates(subset='product_name').reset_index(drop=True) # Drop rows with duplicate product names and save
print('Number of duplicate product names:', products['product_name'].duplicated().sum()) # Check if duplicates were dropped
Number of duplicate product names: 0
departments data frame¶
# Check for duplicate entries in the departments dataframe
print('Number of duplicate entries in the departments dataframe:', departments.duplicated(subset='department').sum())
Number of duplicate entries in the departments dataframe: 0
aisles data frame¶
# Check for aisle names that also appear as department names
print('Number of aisle names that also appear as departments:', aisles['aisle'].isin(departments['department']).sum())
aisles[aisles['aisle'].isin(departments['department'])]
Number of aisle names that also appear as departments: 2
| | aisle_id | aisle |
|---|---|---|
| 5 | 6 | other |
| 99 | 100 | missing |
# Check for duplicate entries in the aisles dataframe
print('Number of duplicate entries in the aisles dataframe:', aisles.duplicated(subset='aisle').sum())
Number of duplicate entries in the aisles dataframe: 0
order_products data frame¶
# Check for duplicate entries in the order_products dataframe
print('Number of duplicate entries in the order_products dataframe:', order_products.duplicated().sum())
Number of duplicate entries in the order_products dataframe: 0
We have now successfully cleaned our data. Let's begin our exploratory data analysis.
Exploratory Data Analysis: Shopping Patterns¶
Data Validation: Time Variables¶
Verifying that temporal variables (order_hour_of_day and order_dow) contain valid values within expected ranges (0-23 for hours, 0-6 for days of week).
# Verify order_hour_of_day ranges from 0 to 23
order_hour_of_day = sorted(orders['order_hour_of_day'].unique())
print('order_hour_of_day range:', min(order_hour_of_day), '-', max(order_hour_of_day))
order_hour_of_day range: 0 - 23
# Verify order_dow ranges from 0 to 6
order_dow = sorted(orders['order_dow'].unique())
print('order_dow range:', min(order_dow), '-', max(order_dow))
order_dow range: 0 - 6
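The same checks can be written as assertions so they fail loudly if the data ever changes (a defensive sketch):
# Assert-based validation: raises immediately if any value falls outside range
assert orders['order_hour_of_day'].between(0, 23).all(), 'unexpected hour value'
assert orders['order_dow'].between(0, 6).all(), 'unexpected day-of-week value'
print('Time variables are within expected ranges')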
Shopping Patterns by Time of Day¶
Analyzing the distribution of orders throughout the day to identify peak shopping hours and customer behavior patterns.
# What time of day do people shop for groceries?
shopping_time = orders['order_hour_of_day'].value_counts().sort_index()
# Visualize with a bar plot
shopping_time.plot(kind='bar', xlabel='Hour of day', ylabel='Number of orders', title='Orders by hour of day', rot=0, figsize=(14,7))
plt.show()
Most orders occur between 9:00 AM and 5:00 PM, with peaks at 10:00 AM and 3:00 PM.
Shopping Patterns by Day of Week¶
Analyzing order distribution across days of the week to understand weekly shopping patterns and identify high-traffic days.
# What day of the week do people shop for groceries?
shopping_day = orders['order_dow'].value_counts().sort_index()
# Visualize with a bar plot
shopping_day.plot(kind='bar', xlabel='Day of the week', ylabel='Number of orders', title='Orders by day of the week', rot=0)
plt.show()
The data dictionary does not state which integer corresponds to which day of the week. Assuming Sunday = 0, people place the most orders at the beginning of the week (Sunday and Monday).
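For a more readable chart, the integer codes can be mapped to day names; note that the Sunday = 0 mapping below is our assumption, not something the data dictionary confirms.
# Map day-of-week codes to names (assumes Sunday = 0, which is unconfirmed)
day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
shopping_day.rename(lambda d: day_names[d]).plot(
    kind='bar', xlabel='Day of the week', ylabel='Number of orders',
    title='Orders by day of the week (assuming Sunday = 0)', rot=45)
plt.show()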
Order Frequency Analysis¶
Examining the time intervals between consecutive orders to understand customer purchasing cycles and reorder patterns.
# How long do people wait until placing another order?
order_duration = orders['days_since_prior_order'].value_counts().sort_index()
# Visualize with a bar plot
order_duration.plot(kind='bar', xlabel='Days since prior order', ylabel='Number of orders', title='Distribution of days between orders', rot=0, figsize=(14,7))
plt.show()
The 0 values probably correspond to customers who placed more than one order on the same day.
The 30-day maximum and the large spike at that value are puzzling, though. The spike might be explained by people who set up recurring subscriptions to automatically order once a month. But that doesn't explain why there are no values above 30 days. I would expect many customers to place orders less often than once a month; maybe those customers were intentionally excluded from the dataset.
Disregarding the spike at 30 days, most people wait between 2 and 10 days between orders. The most common wait time is 7 days; in other words, weekly grocery orders are common. Interestingly, the tail of the distribution also shows small spikes at 14, 21, and 28 days, corresponding to orders every 2, 3, or 4 weeks.
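We can read the counts at those weekly multiples directly to confirm the spikes (a quick verification using the series built above):
# Order counts at weekly multiples of days_since_prior_order
print(order_duration.loc[[7.0, 14.0, 21.0, 28.0]])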
Deep Dive: Customer Behavior Insights¶
Comparative Analysis: Weekday vs Weekend Shopping Patterns¶
Comparing order time distributions between Wednesday (weekday) and Saturday (weekend) to identify behavioral differences that could inform operational and marketing strategies.
# Create masks for Wednesday (3) and Saturday (6)
wednesday_orders = orders[orders['order_dow'] == 3]
saturday_orders = orders[orders['order_dow'] == 6]
# Count order hours for Wednesday and Saturday
wednesday_order_hours = wednesday_orders['order_hour_of_day'].value_counts().sort_index()
saturday_order_hours = saturday_orders['order_hour_of_day'].value_counts().sort_index()
# Combine into a single DataFrame
wed_sat_compare = pd.concat([wednesday_order_hours, saturday_order_hours], axis=1, keys=['Wednesday', 'Saturday'])
wed_sat_compare
| order_hour_of_day | Wednesday | Saturday |
|---|---|---|
| 0 | 373 | 464 |
| 1 | 215 | 254 |
| 2 | 106 | 177 |
| 3 | 101 | 125 |
| 4 | 108 | 118 |
| 5 | 170 | 161 |
| 6 | 643 | 451 |
| 7 | 1732 | 1619 |
| 8 | 3125 | 3246 |
| 9 | 4490 | 4311 |
| 10 | 5026 | 4919 |
| 11 | 5004 | 5116 |
| 12 | 4688 | 5132 |
| 13 | 4674 | 5323 |
| 14 | 4774 | 5375 |
| 15 | 5163 | 5188 |
| 16 | 4976 | 5029 |
| 17 | 4175 | 4295 |
| 18 | 3463 | 3338 |
| 19 | 2652 | 2610 |
| 20 | 1917 | 1847 |
| 21 | 1450 | 1473 |
| 22 | 1154 | 1185 |
| 23 | 718 | 893 |
# Plot bar charts for both days
wed_sat_compare.plot(kind='bar', xlabel='Hour of the day', ylabel='Number of orders', title='Wednesday/Saturday shopping hours', rot=0, figsize=(14,7))
plt.show()
There's a small dip from 11:00 to 13:00 on Wednesdays that is absent on Saturdays. This dip may reflect people stepping away for lunch during that window and not placing orders, a pattern you'd expect on a workday but not a weekend.
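To put a number on that dip, we can compare the midday counts directly (an illustrative check using the comparison frame built above):
# Compare midday order counts on both days; Wednesday dips while Saturday climbs
print(wed_sat_compare.loc[10:13])
print('Wednesday change from 10h to 12h:',
      wed_sat_compare.loc[12, 'Wednesday'] - wed_sat_compare.loc[10, 'Wednesday'])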
Customer Engagement Analysis¶
Analyzing the distribution of orders per customer to understand user engagement levels and identify customer retention patterns.
# Group by user_id and count orders
orders_per_customer = orders.groupby('user_id')['order_id'].count().sort_values()
# Visualize with a histogram
plt.hist(orders_per_customer, bins=28)
plt.xlabel('Orders per customer')
plt.ylabel('Number of customers')
plt.title('Distribution of orders per customer')
plt.show()
Most customers in the dataset placed between 1 and 10 orders, and the number of customers drops sharply as the order count increases beyond one.
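Summary statistics make the skew concrete (a quick supplementary check):
# Quantify the skew in orders per customer
print(orders_per_customer.describe())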
Product Popularity Rankings¶
Identifying the top 20 most frequently ordered products to understand customer preferences and inform inventory optimization.
# Merge order_products with products to get product names
products_merge = order_products.merge(products, on='product_id')
# Group by product_id and product_name, count occurrences
product_popularity = products_merge.groupby(['product_id', 'product_name']).size().sort_values(ascending=False).head(20)
# Visualize the top 20 products
product_popularity.plot(kind='barh') # Use a horizontal bar chart to display products as a list
plt.gca().invert_yaxis() # List most popular products from top down
plt.title('Most popular products')
plt.xlabel('Order count')
plt.ylabel('Top 20 products')
plt.show()
# Display as a table
product_popularity_df = product_popularity.reset_index() # Reset index to default and create new column for existing index
product_popularity_df.columns = ['product_id', 'product_name', 'order_count'] # Label all columns
product_popularity_df
| | product_id | product_name | order_count |
|---|---|---|---|
| 0 | 24852 | banana | 66050 |
| 1 | 13176 | bag of organic bananas | 53297 |
| 2 | 21137 | organic strawberries | 37039 |
| 3 | 21903 | organic baby spinach | 33971 |
| 4 | 47209 | organic hass avocado | 29773 |
| 5 | 47766 | organic avocado | 24689 |
| 6 | 47626 | large lemon | 21495 |
| 7 | 16797 | strawberries | 20018 |
| 8 | 26209 | limes | 19690 |
| 9 | 27845 | organic whole milk | 19600 |
| 10 | 27966 | organic raspberries | 19197 |
| 11 | 22935 | organic yellow onion | 15898 |
| 12 | 24964 | organic garlic | 15292 |
| 13 | 45007 | organic zucchini | 14584 |
| 14 | 39275 | organic blueberries | 13879 |
| 15 | 49683 | cucumber kirby | 13675 |
| 16 | 28204 | organic fuji apple | 12544 |
| 17 | 5876 | organic lemon | 12232 |
| 18 | 8277 | apple honeycrisp organic | 11993 |
| 19 | 40706 | organic grape tomatoes | 11781 |
The top 20 items are all produce, except for the milk. Looks like people want delicious and nutritious!
Advanced Analysis: Basket Composition and Reorder Behavior¶
Basket Size Analysis¶
Analyzing the distribution of items per order to understand typical basket sizes and shopping behavior patterns.
# Group by order_id and count items in each order
items_per_order = order_products.groupby('order_id')['product_id'].count()
print('Items per order Min:', items_per_order.min(), '\nItems per order Max:', items_per_order.max())
# Count how frequently each order size occurs
order_size_distribution = items_per_order.value_counts().sort_index()
print(order_size_distribution.head(20))
Items per order Min: 1 
Items per order Max: 127
product_id
1     21847
2     26292
3     29046
4     31054
5     31923
6     31698
7     30822
8     28539
9     25742
10    23248
11    20406
12    18539
13    16497
14    14472
15    12696
16    11465
17    10002
18     8726
19     7612
20     6771
Name: count, dtype: int64
# Visualize the full distribution
order_size_distribution.plot(kind="bar", xlabel='Number of items', ylabel='Number of orders', title='Distribution of items per order', figsize=(14,7))
plt.show()
Much of the plot's range is taken up by the long tail of the distribution. To get a better look at the body, let's choose a value in the tail as a cutoff and plot only orders with fewer items than that. An order size of 35 items is far enough into the tail for this.
# Plot orders with fewer than 35 items for better visualization
order_size_distribution.plot(kind="bar", xlabel='Number of items', ylabel='Number of orders', title='Distribution of items per order (up to 35 items)', figsize=(14,7))
plt.xlim(0,34)
plt.show()
The typical order contains 5 or 6 items, with most orders having between 1 and 20 items.
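The central-tendency statistics back this up (a quick confirmation using the series built above):
# Median order size and the single most common size
print('Median items per order:', items_per_order.median())
print('Most common order size:', order_size_distribution.idxmax())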
Reorder Frequency Analysis¶
Identifying the most frequently reordered products to understand customer loyalty and product staples.
# Filter for reordered items only
reordered_items = order_products[order_products['reordered'] == 1]
# Merge with products to get product names
reordered_with_names = reordered_items.merge(products, on='product_id')
# Group by product and count reorders
top_reordered = reordered_with_names.groupby(['product_id', 'product_name']).size().sort_values(ascending=False).head(20)
print(top_reordered)
product_id  product_name            
24852       banana                      55763
13176       bag of organic bananas      44450
21137       organic strawberries        28639
21903       organic baby spinach        26233
47209       organic hass avocado        23629
47766       organic avocado             18743
27845       organic whole milk          16251
47626       large lemon                 15044
27966       organic raspberries         14748
16797       strawberries                13945
26209       limes                       13327
22935       organic yellow onion        11145
24964       organic garlic              10411
45007       organic zucchini            10076
49683       cucumber kirby               9538
28204       organic fuji apple           8989
8277        apple honeycrisp organic     8836
39275       organic blueberries          8799
5876        organic lemon                8412
49235       organic half & half          8389
dtype: int64
Methodology Note: Analysis uses the complete product dataset (prior to deduplication) and groups by product name to capture all product IDs associated with each product, ensuring comprehensive reorder counting.
# Merge with products df saved before the dropped duplicate rows to ensure all product ids are accounted for
reordered_with_names = reordered_items.merge(products_before_drop, on='product_id')
# Show some products have multiple IDs
id_per_product = reordered_with_names.groupby(['product_name'])['product_id'].nunique().sort_values(ascending=False).head(10)
print('IDs per product shows some products with multiple IDs:\n', id_per_product)
# Group by product and count reorders
top_reordered = reordered_with_names.groupby(['product_name']).size().sort_values(ascending=False).head(20) # Group only by product_name to count multiple product IDs with the same product name together
print()
print('This list groups only by product name to include all IDs for each product:\n', top_reordered)
IDs per product shows some products with multiple IDs:
 product_name
unknown                                  508
green tea with ginseng and honey           3
chopped garlic in water                    2
spring water body wash                     2
cream of tartar                            2
cream of mushroom soup                     2
aged balsamic vinegar of modena            2
organic balsamic vinegar of modena         2
family size lasagna with meat & sauce      2
cream of celery condensed soup             2
Name: product_id, dtype: int64

This list groups only by product name to include all IDs for each product:
 product_name
banana                      55763
bag of organic bananas      44450
organic strawberries        28639
organic baby spinach        26233
organic hass avocado        23629
organic avocado             18743
organic whole milk          16251
large lemon                 15044
organic raspberries         14748
strawberries                13945
limes                       13327
organic yellow onion        11145
organic garlic              10411
organic zucchini            10076
cucumber kirby               9538
organic fuji apple           8989
apple honeycrisp organic     8836
organic blueberries          8799
organic lemon                8412
organic half & half          8389
dtype: int64
# Visualize top 20 reordered items
top_reordered.plot(kind='barh') # Use a horizontal bar chart to display the items as a list
plt.gca().invert_yaxis() # List the items with most reordered at the top
plt.xlabel('Number of reorders')
plt.ylabel('Top reordered products')
plt.title('Top 20 most frequently reordered products')
plt.show()
# Display as a table
top_reordered_df = top_reordered.reset_index() # Reset the index to default and create a new column for the existing index
top_reordered_df.columns = ['product_name', 'reorder_count'] # Label the column names
top_reordered_df
| | product_name | reorder_count |
|---|---|---|
| 0 | banana | 55763 |
| 1 | bag of organic bananas | 44450 |
| 2 | organic strawberries | 28639 |
| 3 | organic baby spinach | 26233 |
| 4 | organic hass avocado | 23629 |
| 5 | organic avocado | 18743 |
| 6 | organic whole milk | 16251 |
| 7 | large lemon | 15044 |
| 8 | organic raspberries | 14748 |
| 9 | strawberries | 13945 |
| 10 | limes | 13327 |
| 11 | organic yellow onion | 11145 |
| 12 | organic garlic | 10411 |
| 13 | organic zucchini | 10076 |
| 14 | cucumber kirby | 9538 |
| 15 | organic fuji apple | 8989 |
| 16 | apple honeycrisp organic | 8836 |
| 17 | organic blueberries | 8799 |
| 18 | organic lemon | 8412 |
| 19 | organic half & half | 8389 |
It looks like produce and dairy comprise the most reordered products as well. It makes sense that perishables would be the most reordered items.
Product Reorder Rate Analysis¶
Calculating reorder proportions for each product to identify which products generate the highest customer loyalty and repeat purchases.
# Merge order_products with products
products_with_names = order_products.merge(products, on='product_id')
# Group by product and calculate mean of reordered column
reorder_proportion = products_with_names.groupby(['product_id', 'product_name'])['reordered'].mean().sort_values(ascending=False)
# Convert to DataFrame and display top products
reorder_proportion_df = reorder_proportion.reset_index()
reorder_proportion_df.columns = ['product_id', 'product_name', 'reorder_proportion']
reorder_proportion_df.head(20)
| | product_id | product_name | reorder_proportion |
|---|---|---|---|
| 0 | 49431 | pretzels- mighty minis | 1.0 |
| 1 | 49662 | bacon cheddar pretzel pieces | 1.0 |
| 2 | 6409 | 100% juice cranberry | 1.0 |
| 3 | 6433 | raw veggie wrappers | 1.0 |
| 4 | 49432 | bonbon bar | 1.0 |
| 5 | 49661 | porto | 1.0 |
| 6 | 6490 | vitamin d3 1000 iu in extra virgin olive oil s... | 1.0 |
| 7 | 49657 | cabernet tomatoes | 1.0 |
| 8 | 6530 | organic popped corn simply salted | 1.0 |
| 9 | 49639 | pecans- maple- premium blend | 1.0 |
| 10 | 6189 | meat snacks sriracha beef jerky | 1.0 |
| 11 | 6389 | freshly squeezed lemonade | 1.0 |
| 12 | 49530 | ground sesame tahina | 1.0 |
| 13 | 49589 | spray red honeysuckle nectar air freshener | 1.0 |
| 14 | 49601 | pomegranate gummy bears | 1.0 |
| 15 | 49619 | opo squash | 1.0 |
| 16 | 49625 | golden wheat deep cleanse shampoo | 1.0 |
| 17 | 49430 | cookie con amore jelly cookies | 1.0 |
| 18 | 26555 | raspberry sorbet with dark chocolate bars | 1.0 |
| 19 | 26625 | gravy- country-style sausage | 1.0 |
Methodology Note: Using pre-deduplicated product data grouped by product name to ensure accurate reorder rate calculations across all product variants.
# Merge with products dataframe saved before the dropped duplicate rows to ensure all product ids are accounted for
products_with_names = order_products.merge(products_before_drop, on='product_id')
# Group by product and calculate mean of reordered column
reorder_proportion = products_with_names.groupby(['product_name'])['reordered'].mean().sort_values(ascending=False)
# Convert to DataFrame and display top products
reorder_proportion_df = reorder_proportion.reset_index()
reorder_proportion_df.columns = ['product_name', 'reorder_proportion']
reorder_proportion_df.head(20)
| | product_name | reorder_proportion |
|---|---|---|
| 0 | echo caviar | 1.0 |
| 1 | zinfandel dry creek | 1.0 |
| 2 | (70% juice!) mountain raspberry juice squeeze | 1.0 |
| 3 | apple butter spread | 1.0 |
| 4 | apple berry sauce on the go pouches | 1.0 |
| 5 | appetizers for cats steamed wild alaskan salmon | 1.0 |
| 6 | steamables golden potatoes | 1.0 |
| 7 | steamfresh garlic parm peas | 1.0 |
| 8 | steamfresh protein italian style | 1.0 |
| 9 | steel cut with protein maple & brown sugar oat... | 1.0 |
| 10 | dual scented oil refill- hawaiian aloha | 1.0 |
| 11 | dusk deodorant milled bar soap | 1.0 |
| 12 | dry sake | 1.0 |
| 13 | zzzquil vanilla cherry nighttime liquid sleep aid | 1.0 |
| 14 | sidekicks strawberry shake | 1.0 |
| 15 | sierra cedar incense | 1.0 |
| 16 | super fruit | 1.0 |
| 17 | super grains tabbouleh | 1.0 |
| 18 | alpo gravy cravers variety pack dog food | 1.0 |
| 19 | ezekial 4:9 organic sprouted blueberry waffles | 1.0 |
# Display products sorted by product_name
reorder_proportion_df.sort_values('product_name').head(20)
| | product_name | reorder_proportion |
|---|---|---|
| 30018 | #2 coffee filters | 0.254545 |
| 41396 | #2 cone white coffee filters | 0.000000 |
| 41387 | #2 mechanical pencils | 0.000000 |
| 25048 | #4 natural brown coffee filters | 0.358974 |
| 19457 | & go! hazelnut spread + pretzel sticks | 0.466667 |
| 2 | (70% juice!) mountain raspberry juice squeeze | 1.000000 |
| 21360 | +energy black cherry vegetable & fruit juice | 0.428571 |
| 10813 | 0 calorie acai raspberry water beverage | 0.583333 |
| 2049 | 0 calorie fuji apple pear water beverage | 0.833333 |
| 2672 | 0 calorie strawberry dragonfruit water beverage | 0.782609 |
| 7917 | 0% fat black cherry greek yogurt y | 0.636364 |
| 8630 | 0% fat blueberry greek yogurt | 0.623377 |
| 4984 | 0% fat free organic milk | 0.694301 |
| 14090 | 0% fat greek yogurt black cherry on the bottom | 0.526316 |
| 5245 | 0% fat greek yogurt vanilla | 0.687500 |
| 6930 | 0% fat organic greek vanilla yogurt | 0.665339 |
| 23753 | 0% fat peach greek yogurt | 0.388889 |
| 2806 | 0% fat strawberry greek yogurt | 0.775510 |
| 11062 | 0% fat superfruits greek yogurt | 0.579439 |
| 8157 | 0% fat vanilla greek yogurt | 0.631579 |
# Show summary statistics
print('Summary statistics for products reorder proportion:\n', reorder_proportion_df['reorder_proportion'].describe())
Summary statistics for products reorder proportion:
 count    44518.000000
mean         0.389485
std          0.270599
min          0.000000
25%          0.171645
50%          0.413043
75%          0.578425
max          1.000000
Name: reorder_proportion, dtype: float64
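The many 1.0 proportions above likely come from products that appear in only a few orders, where even a single reorder yields a perfect score. One way to get a more robust ranking is to require a minimum order count before sorting; the threshold of 50 below is an arbitrary choice for illustration.
# Rank reorder proportion only among products with at least 50 recorded orders
# (threshold is arbitrary, chosen for illustration)
product_stats = products_with_names.groupby('product_name')['reordered'].agg(['mean', 'count'])
print(product_stats[product_stats['count'] >= 50]
      .sort_values('mean', ascending=False).head(10))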
Customer Reorder Behavior Analysis¶
Analyzing individual customer reorder rates to segment users by loyalty levels and identify retention patterns.
# Merge order_products with orders to get customer information
customer_orders = order_products.merge(orders, on='order_id')
# Group by user_id and calculate mean of reordered column
customer_reorder_proportion = customer_orders.groupby('user_id')['reordered'].mean().sort_values(ascending=False)
# Convert to DataFrame
customer_reorder_df = customer_reorder_proportion.reset_index()
customer_reorder_df.columns = ['user_id', 'reorder_proportion']
customer_reorder_df.head(20)
| | user_id | reorder_proportion |
|---|---|---|
| 0 | 206164 | 1.0 |
| 1 | 17973 | 1.0 |
| 2 | 196795 | 1.0 |
| 3 | 196792 | 1.0 |
| 4 | 196791 | 1.0 |
| 5 | 18048 | 1.0 |
| 6 | 17849 | 1.0 |
| 7 | 196759 | 1.0 |
| 8 | 135680 | 1.0 |
| 9 | 136211 | 1.0 |
| 10 | 136206 | 1.0 |
| 11 | 135720 | 1.0 |
| 12 | 135804 | 1.0 |
| 13 | 135800 | 1.0 |
| 14 | 135797 | 1.0 |
| 15 | 136156 | 1.0 |
| 16 | 135825 | 1.0 |
| 17 | 88 | 1.0 |
| 18 | 135820 | 1.0 |
| 19 | 136294 | 1.0 |
# Show summary statistics for customer reorder proportions
print('Summary statistics for customer reorder proportion:\n', customer_reorder_df['reorder_proportion'].describe())
# Visualize the distribution
plt.hist(customer_reorder_df['reorder_proportion'], bins=50)
plt.xlabel('Proportion of products reordered')
plt.ylabel('Number of customers')
plt.title('Distribution of reorder proportion by customer')
plt.show()
Summary statistics for customer reorder proportion:
 count    149626.000000
mean          0.494853
std           0.292685
min           0.000000
25%           0.272727
50%           0.500000
75%           0.724138
max           1.000000
Name: reorder_proportion, dtype: float64
Cart Priority Analysis¶
Identifying products most frequently added to the cart first to understand shopping priorities and potential app/website layout influences.
# Merge order_products with products
first_items = order_products[order_products['add_to_cart_order'] == 1].merge(products, on='product_id')
# Group by product and count occurrences
top_first_items = first_items.groupby(['product_id', 'product_name']).size().sort_values(ascending=False).head(20)
print(top_first_items)
product_id  product_name               
24852       banana                        15562
13176       bag of organic bananas        11026
27845       organic whole milk             4363
21137       organic strawberries           3946
47209       organic hass avocado           3390
21903       organic baby spinach           3336
47766       organic avocado                3044
19660       spring water                   2336
16797       strawberries                   2308
27966       organic raspberries            2024
44632       sparkling water grapefruit     1914
49235       organic half & half            1797
47626       large lemon                    1737
196         soda                           1733
38689       organic reduced fat milk       1397
26209       limes                          1370
12341       hass avocados                  1340
5785        organic reduced fat 2% milk    1310
27086       half & half                    1309
43352       raspberries                    1246
dtype: int64
# Visualize top 20 first-in-cart items
top_first_items.plot(kind='barh')
plt.gca().invert_yaxis()
plt.xlabel('Number of times added first')
plt.ylabel('Product name')
plt.title('Top 20 products added to cart first')
plt.show()
# Display as a table
top_first_items_df = top_first_items.reset_index()
top_first_items_df.columns = ['product_id', 'product_name', 'first_in_cart_count']
top_first_items_df
| | product_id | product_name | first_in_cart_count |
|---|---|---|---|
| 0 | 24852 | banana | 15562 |
| 1 | 13176 | bag of organic bananas | 11026 |
| 2 | 27845 | organic whole milk | 4363 |
| 3 | 21137 | organic strawberries | 3946 |
| 4 | 47209 | organic hass avocado | 3390 |
| 5 | 21903 | organic baby spinach | 3336 |
| 6 | 47766 | organic avocado | 3044 |
| 7 | 19660 | spring water | 2336 |
| 8 | 16797 | strawberries | 2308 |
| 9 | 27966 | organic raspberries | 2024 |
| 10 | 44632 | sparkling water grapefruit | 1914 |
| 11 | 49235 | organic half & half | 1797 |
| 12 | 47626 | large lemon | 1737 |
| 13 | 196 | soda | 1733 |
| 14 | 38689 | organic reduced fat milk | 1397 |
| 15 | 26209 | limes | 1370 |
| 16 | 12341 | hass avocados | 1340 |
| 17 | 5785 | organic reduced fat 2% milk | 1310 |
| 18 | 27086 | half & half | 1309 |
| 19 | 43352 | raspberries | 1246 |
The products most often placed in the cart first are produce, dairy, and beverages such as soda or water. Without firsthand experience using Instacart, it's hard to say why; this could have more to do with app design than with the products themselves. There is considerable overlap between this result and the earlier lists of most popular and most reordered items. It could simply be that the app surfaces popular items as the first suggested purchases, making it convenient for customers to add them to the cart first.
Summary and Business Recommendations¶
Key Findings¶
Shopping Behavior Patterns¶
- Peak Shopping Hours: Orders concentrate between 9:00 AM and 5:00 PM, with notable peaks at 10:00 AM and 3:00 PM
- Weekly Patterns: Sunday and Monday show the highest order volumes (assuming Sunday = 0)
- Reorder Cycles: Most customers reorder within 7 days, with secondary spikes at 14, 21, and 28-day intervals suggesting weekly and multi-week shopping routines
Product Insights¶
- Category Dominance: Fresh produce and dairy products dominate the top 20 most popular items, indicating strong demand for perishables
- Reorder Behavior: Perishable items (produce, dairy) show the highest reorder rates, confirming they are customer staples
- Basket Size: Typical orders contain 5-6 items, with most orders ranging from 1-20 items
Customer Engagement¶
- Order Frequency: Most customers place between 1 and 10 orders, with a sharp drop-off after the first order, suggesting retention opportunities
- Weekday vs Weekend: Wednesday shows a midday dip (11h-13h) absent on Saturdays, possibly reflecting work schedule impacts on shopping behavior
Business Recommendations¶
Operational Optimization¶
- Staffing & Inventory: Allocate resources based on peak hours (10 AM and 3 PM) and high-volume days (Sunday/Monday)
- Fresh Product Focus: Prioritize stock availability for top produce and dairy items, which drive both popularity and reorder rates
- Weekend Strategy: Adjust inventory and fulfillment capacity for weekend orders, which show different temporal patterns
Marketing & Retention¶
- New Customer Onboarding: Given the sharp drop-off after first orders, implement targeted retention campaigns for new users
- Subscription Services: Leverage the 7-day reorder cycle to promote weekly subscription options for staple items
- Time-Based Promotions: Schedule promotional campaigns during peak shopping windows for maximum visibility
Product Recommendations¶
- Personalized Suggestions: Utilize reorder patterns to predict customer needs and suggest frequently reordered items
- Cart Optimization: Feature popular first-cart items (produce, dairy, beverages) prominently in the app interface
- Basket Building: Encourage larger basket sizes through bundling strategies, given that most orders contain fewer than 10 items
Next Steps for Analysis¶
- Investigate the 30-day maximum in days_since_prior_order to understand data collection methodology
- Segment customers by reorder behavior to develop targeted retention strategies
- Analyze seasonal trends if temporal data is available
- Examine correlation between basket size and customer lifetime value
- Explore department-level performance beyond individual products