Data Transformation

Adding New Columns

Simple Column Addition

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000, 60000, 55000]
})

# Add a new column
df["bonus"] = df["salary"] * 0.1
print("After adding bonus column:")
print(df)

Calculated Columns

import pandas as pd

df = pd.DataFrame({
    "price": [100, 200, 150],
    "quantity": [5, 3, 4],
    "discount": [0.1, 0.15, 0.05]
})

# Calculate total with discount
df["total"] = df["price"] * df["quantity"] * (1 - df["discount"])
print("With calculated total:")
print(df)

Using assign() Method

import pandas as pd

df = pd.DataFrame({
    "x": [1, 2, 3],
    "y": [4, 5, 6]
})

# Using assign (returns new DataFrame)
df_new = df.assign(z=lambda x: x["x"] + x["y"])
print("Using assign():")
print(df_new)

Modifying Existing Columns

In-Place Modification

import pandas as pd

df = pd.DataFrame({
    "price": [100, 200, 150],
    "quantity": [5, 3, 4]
})

# Modify existing column
df["price"] = df["price"] * 1.1  # 10% increase
print("After price increase:")
print(df)

Conditional Column Updates

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000, 60000, 55000]
})

# Update based on condition
df.loc[df["age"] > 30, "salary"] = df.loc[df["age"] > 30, "salary"] * 1.05
print("After salary increase for age > 30:")
print(df)

Applying Functions

Using apply() on Series

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35]
})

# Apply function to a column
df["age_group"] = df["age"].apply(lambda x: "Young" if x < 30 else "Old")
print("With age groups:")
print(df)

Using apply() on DataFrame

import pandas as pd

df = pd.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9]
})

# Apply function to each row
df["sum"] = df.apply(lambda row: row["a"] + row["b"] + row["c"], axis=1)
print("With row sum:")
print(df)

Using map() for Value Mapping

import pandas as pd

df = pd.DataFrame({
    "grade": ["A", "B", "C", "A", "B"]
})

# Map values
grade_map = {"A": 4.0, "B": 3.0, "C": 2.0}
df["gpa"] = df["grade"].map(grade_map)
print("With GPA mapping:")
print(df)

Using replace()

import pandas as pd

df = pd.DataFrame({
    "status": ["active", "inactive", "pending", "active"]
})

# Replace values
df["status"] = df["status"].replace({"active": 1, "inactive": 0, "pending": 2})
print("After replacement:")
print(df)

String Operations

String Methods

import pandas as pd

df = pd.DataFrame({
    "name": ["alice smith", "BOB JONES", "charlie brown"]
})

# Capitalize
df["name_capitalized"] = df["name"].str.capitalize()
print("Capitalized:")
print(df[["name", "name_capitalized"]])

# Title case
df["name_title"] = df["name"].str.title()
print("\nTitle case:")
print(df[["name", "name_title"]])

String Splitting

import pandas as pd

df = pd.DataFrame({
    "full_name": ["Alice Smith", "Bob Jones", "Charlie Brown"]
})

# Split into first and last name
df[["first_name", "last_name"]] = df["full_name"].str.split(" ", expand=True)
print("After splitting:")
print(df)

String Concatenation

import pandas as pd

df = pd.DataFrame({
    "first": ["Alice", "Bob", "Charlie"],
    "last": ["Smith", "Jones", "Brown"],
    "age": [25, 30, 35]
})

# Concatenate strings
df["full_name"] = df["first"] + " " + df["last"]
df["description"] = df["full_name"] + " is " + df["age"].astype(str) + " years old"
print("With concatenated columns:")
print(df[["full_name", "description"]])

Date Operations

Creating Date Columns

import pandas as pd

df = pd.DataFrame({
    "date_str": ["2024-01-15", "2024-02-20", "2024-03-10"]
})

# Convert to datetime
df["date"] = pd.to_datetime(df["date_str"])
print("After conversion:")
print(df)
print(f"\nDate type: {df['date'].dtype}")

Extracting Date Components

import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=3, freq="M")
})

# Extract components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["day_name"] = df["date"].dt.day_name()

print("With date components:")
print(df)

Sorting Data

Sorting by Column

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [35, 25, 30, 28],
    "salary": [55000, 50000, 60000, 58000]
})

# Sort by age (ascending)
df_sorted = df.sort_values("age")
print("Sorted by age:")
print(df_sorted)

# Sort by salary (descending)
df_sorted2 = df.sort_values("salary", ascending=False)
print("\nSorted by salary (descending):")
print(df_sorted2)

Sorting by Multiple Columns

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "department": ["Sales", "IT", "Sales", "IT"],
    "salary": [50000, 60000, 55000, 58000]
})

# Sort by department, then salary
df_sorted = df.sort_values(["department", "salary"], ascending=[True, False])
print("Sorted by department, then salary:")
print(df_sorted)

Reordering Columns

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["NYC", "SF", "Seattle"],
    "salary": [50000, 60000, 55000]
})

# Reorder columns
df_reordered = df[["name", "city", "age", "salary"]]
print("Reordered columns:")
print(df_reordered)

Pivoting Data

pivot() - Reshaping Data

import pandas as pd

df = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02"],
    "product": ["A", "B", "A", "B"],
    "sales": [100, 150, 120, 180]
})

# Pivot
df_pivot = df.pivot(index="date", columns="product", values="sales")
print("Pivoted data:")
print(df_pivot)

Melting Data

melt() - Unpivoting Data

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob"],
    "Q1": [100, 150],
    "Q2": [120, 180],
    "Q3": [130, 160]
})

# Melt
df_melted = df.melt(id_vars=["name"], 
                    value_vars=["Q1", "Q2", "Q3"],
                    var_name="quarter",
                    value_name="sales")
print("Melted data:")
print(df_melted)

Try It Yourself!

Practice transforming data:

import pandas as pd

df = pd.DataFrame({
    "product": ["Laptop", "Mouse", "Keyboard"],
    "price": [1200, 25, 75],
    "quantity": [5, 50, 30],
    "discount": [0.1, 0.05, 0.15]
})

# Calculate:
# 1. Revenue before discount (price * quantity)
# 2. Discount amount (price * quantity * discount)
# 3. Final revenue (revenue - discount)
# 4. Add a column indicating if product is expensive (> $100)

df["revenue_before"] = df["price"] * df["quantity"]
df["discount_amount"] = df["revenue_before"] * df["discount"]
df["final_revenue"] = df["revenue_before"] - df["discount_amount"]
df["is_expensive"] = df["price"] > 100

print(df)

Data Transformation

points

You earned points! 🎉 Keep practicing! 💪

Sign up to save your points and access 10,000+ exercises Sign up to track your progress and access 10,000+ exercises

Daily Limit Reached

You've used your exercises for today. Upgrade to Elite for unlimited practices.

🔥 exercises left — Upgrade for more!

This module requires Elite

Try the first module of this course for free, then upgrade to unlock all modules and exercises.

No exercise selected

Click "New Exercise" to begin

Generating exercise...

Loading hint...
Hint:
Syntax Syntax OK Syntax off
Score
Total

Feedback

Tip