# Import necessary libraries
import pandas as pd

# Load the raw CSV from the project's data folder into a DataFrame
df = pd.read_csv('data/movies_and_shows.csv')

# Preview the leading rows to see how the table is laid out
df.head()

# Report the header labels exactly as they appear in the file
print("Original column names:", list(df.columns), sep="\n")

Original column names:
['   name', 'Character', 'r0le', 'TITLE', '  Type', 'release Year', 'genres', 'imdb sc0re', 'imdb v0tes']

# Normalize the header labels in four steps: trim surrounding whitespace,
# lowercase, turn inner spaces into underscores, and map the digit 0 to the
# letter o (the raw headers contain typos such as 'r0le' and 'imdb sc0re')
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('0', 'o')
)

print("\nCleaned column names:", df.columns.tolist(), sep="\n")

Cleaned column names:
['name', 'character', 'role', 'title', 'type', 'release_year', 'genres', 'imdb_score', 'imdb_votes']

# Verify the changes: preview the first rows again, now under the cleaned column names
df.head()

# Display basic information about the dataset: per-column non-null counts,
# dtypes, and the overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85579 entries, 0 to 85578
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          85579 non-null  object 
 1   character     85579 non-null  object 
 2   role          85579 non-null  object 
 3   title         85578 non-null  object 
 4   type          85579 non-null  object 
 5   release_year  85579 non-null  int64  
 6   genres        85579 non-null  object 
 7   imdb_score    80970 non-null  float64
 8   imdb_votes    80853 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 5.9+ MB

# Tally the missing (NaN) cells in every column
print("Missing values per column:", df.isna().sum(), sep="\n")

Missing values per column:
name               0
character          0
role               0
title              1
type               0
release_year       0
genres             0
imdb_score      4609
imdb_votes      4726
dtype: int64

# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

# Keep only the rows whose genres text mentions 'drama'
# (case-insensitive; rows with a missing genres value count as no match)
drama_df = df.loc[df['genres'].str.contains('drama', case=False, na=False)]

print(f"Found {len(drama_df)} entries containing 'drama'")
drama_df.head()

Found 52713 entries containing 'drama'

# Function to filter movies/shows by decade
def filter_by_decade(df, start_year):
    """
    Return the rows of a DataFrame whose release year falls within one decade.

    Parameters:
    - df: DataFrame with a 'release_year' column
    - start_year: First year of the decade (e.g., 1980 for the 1980s)

    Returns:
    - DataFrame containing only the rows where
      start_year <= release_year <= start_year + 9
    """
    # Both ends of the range are inclusive, so the decade spans ten years
    in_decade = df['release_year'].between(start_year, start_year + 9)
    return df[in_decade]

# Example: Filter movies from the 1990s
movies_90s = filter_by_decade(df, 1990)

# The table holds one row per credited person, so len(movies_90s) would count
# credits rather than movies/shows. Collapse to distinct (title, release_year)
# pairs so the reported number matches what the message claims.
titles_90s = movies_90s[['title', 'release_year']].drop_duplicates()
print(f"Movies/shows from the 1990s: {len(titles_90s)}")
movies_90s.head()

Movies/shows from the 1990s: 2845

# Filter for highly-rated content (IMDb score >= 8.0); rows with a missing
# score never satisfy the comparison and are excluded
high_rated = df[df['imdb_score'] >= 8.0]

# The table holds one row per credited person, so len(high_rated) would count
# credits. De-duplicate on the title-level columns first so the printed number
# is a count of titles, then reuse the same frame for the top-10 listing.
high_rated_titles = high_rated[['title', 'imdb_score', 'release_year']].drop_duplicates()

print(f"Number of highly-rated titles (>=8.0): {len(high_rated_titles)}")
high_rated_titles.sort_values('imdb_score', ascending=False).head(10)

Number of highly-rated titles (>=8.0): 6093

def get_actors_for_title(title, data=None):
    """
    Retrieve a comma-separated list of actors for a given movie/show title.

    Parameters:
    - title: The title of the movie or show (exact, case-sensitive match)
    - data: Optional DataFrame with 'title', 'role' and 'name' columns.
      Defaults to the notebook-level ``df`` so existing one-argument calls
      keep working.

    Returns:
    - String of actor names separated by ', ' in table order; an empty
      string when no matching actor rows exist. Different productions that
      share the same title are not distinguished.
    """
    # Fall back to the notebook-level DataFrame only when none is supplied
    source = df if data is None else data

    # Rows for the requested title whose role is exactly 'ACTOR'
    is_actor_credit = (source['title'] == title) & (source['role'] == 'ACTOR')

    # Drop missing names and coerce to str: str.join raises TypeError on NaN
    actor_names = source.loc[is_actor_credit, 'name'].dropna().astype(str)

    # Combine names into a single string
    return ', '.join(actor_names)

# Demo: list the cast recorded for one well-known title
print("Actors in 'Taxi Driver':", get_actors_for_title("Taxi Driver"), sep="\n")

Actors in 'Taxi Driver':
Robert De Niro, Jodie Foster, Albert Brooks, Harvey Keitel, Cybill Shepherd, Peter Boyle, Leonard Harris, Diahnne Abbott, Gino Ardito, Martin Scorsese, Murray Moston, Richard Higgs, Bill Minkin, Bob Maroff, Victor Argo, Joe Spinell, Robinson Frank Adu, Brenda Dickson, Norman Matlock, Harry Northup, Harlan Cary Poe, Steven Prince, Peter Savage, Nicholas Shields, Ralph S. Singleton, Annie Gagen, Carson Grant, Mary-Pat Green, Debbi Morgan, Don Stroud, Copper Cunningham, Garth Avery, Nat Grant, Billie Perkins, Catherine Scorsese, Charles Scorsese, Odunlade Adekola, Ijeoma Grace Agu

def categorize_imdb_score(title, data=None):
    """
    Categorize a movie/show based on its IMDb score.

    Categories:
    - Excellent: score >= 9.0
    - Good: 7.0 <= score < 9.0
    - Average: 5.0 <= score < 7.0
    - Low: score < 5.0

    Parameters:
    - title: The title of the movie or show (exact, case-sensitive match)
    - data: Optional DataFrame with 'title' and 'imdb_score' columns.
      Defaults to the notebook-level ``df`` so existing one-argument calls
      keep working.

    Returns:
    - Category string, 'Title not found' when no row has this title, or
      'Score not available' when the title exists but every matching row
      has a missing IMDb score.
    """
    # Fall back to the notebook-level DataFrame only when none is supplied
    source = df if data is None else data

    # All scores recorded on rows carrying this title
    title_scores = source.loc[source['title'] == title, 'imdb_score']

    # Check if title exists
    if title_scores.empty:
        return 'Title not found'

    # A NaN score fails every >= comparison and would fall through to 'Low',
    # so missing scores are removed and reported explicitly instead
    known_scores = title_scores.dropna()
    if known_scores.empty:
        return 'Score not available'

    # Use the first recorded score for the title
    imdb_score = float(known_scores.iloc[0])

    # Categorize and return, checking thresholds from highest to lowest
    if imdb_score >= 9.0:
        return 'Excellent'
    if imdb_score >= 7.0:
        return 'Good'
    if imdb_score >= 5.0:
        return 'Average'
    return 'Low'

# Exercise the categorizer with one sample title per outcome,
# plus a title that is absent from the dataset
test_titles = [
    "Breaking Bad",
    "Taxi Driver",
    "The Blue Lagoon",
    "Dostana",
    "Nonexistent Movie Title",
]

for title in test_titles:
    category = categorize_imdb_score(title)
    print(title, category, sep=": ")

Breaking Bad: Excellent
Taxi Driver: Good
The Blue Lagoon: Average
Dostana: Low
Nonexistent Movie Title: Title not found

	name	Character	r0le	TITLE	Type	release Year	genres	imdb sc0re	imdb v0tes
0	Robert De Niro	Travis Bickle	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
1	Jodie Foster	Iris Steensma	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
2	Albert Brooks	Tom	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
3	Harvey Keitel	Matthew 'Sport' Higgins	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
4	Cybill Shepherd	Betsy	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0

	name	character	role	title	type	release_year	genres	imdb_score	imdb_votes
0	Robert De Niro	Travis Bickle	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
1	Jodie Foster	Iris Steensma	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
2	Albert Brooks	Tom	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
3	Harvey Keitel	Matthew 'Sport' Higgins	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
4	Cybill Shepherd	Betsy	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0

	release_year	imdb_score	imdb_votes
count	85579.000000	80970.000000	8.085300e+04
mean	2015.879994	6.425877	5.978271e+04
std	7.724668	1.122655	1.846287e+05
min	1954.000000	1.500000	5.000000e+00
25%	2015.000000	5.700000	1.266000e+03
50%	2018.000000	6.500000	5.448000e+03
75%	2021.000000	7.200000	3.360900e+04
max	2022.000000	9.500000	2.294231e+06

	name	character	role	title	type	release_year	genres	imdb_score	imdb_votes
0	Robert De Niro	Travis Bickle	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
1	Jodie Foster	Iris Steensma	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
2	Albert Brooks	Tom	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
3	Harvey Keitel	Matthew 'Sport' Higgins	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0
4	Cybill Shepherd	Betsy	ACTOR	Taxi Driver	MOVIE	1976	['drama', 'crime']	8.2	808582.0

	name	character	role	title	type	release_year	genres	imdb_score	imdb_votes
571	Ray Liotta	Henry Hill	ACTOR	GoodFellas	MOVIE	1990	['drama', 'crime']	8.7	1131681.0
572	Robert De Niro	James Conway	ACTOR	GoodFellas	MOVIE	1990	['drama', 'crime']	8.7	1131681.0
573	Joe Pesci	Tommy DeVito	ACTOR	GoodFellas	MOVIE	1990	['drama', 'crime']	8.7	1131681.0
574	Lorraine Bracco	Karen Hill	ACTOR	GoodFellas	MOVIE	1990	['drama', 'crime']	8.7	1131681.0
575	Paul Sorvino	Paul Cicero	ACTOR	GoodFellas	MOVIE	1990	['drama', 'crime']	8.7	1131681.0

Movies and Shows Data Analysis

Project Overview

Dataset Description

1. Setup and Data Loading

2. Data Cleaning

3. Data Exploration

4. Filtering and Analysis

4.1 Filtering by Genre

4.2 Filtering by Decade

4.3 High-Rated Movies

5. Custom Functions

5.1 Get Actors for a Title

5.2 Categorize Movies by IMDb Score

6. Summary and Insights

Key Findings:

Potential Next Steps:

	title	imdb_score	release_year
4456	Breaking Bad	9.5	2008
4872	Avatar: The Last Airbender	9.3	2005
46185	Our Planet	9.3	2019
22523	Reply 1988	9.2	2015
65401	Major	9.1	2022
30885	My Mister	9.1	2018
51965	Kota Factory	9.1	2019
44207	The Last Dance	9.1	2020
20795	Leah Remini: Scientology and the Aftermath	9.0	2016
15764	Attack on Titan	9.0	2013