Page 449 - Ai_V3.0_c11_flipbook
P. 449
The Python code is as follows:
import pandas as pd
from google.colab import drive
# Path to the CSV file; replace it with the location of your file on Google Drive.
file_path = 'sales_data.csv'

# Summary of the cleaning steps performed by this script. Defined at module
# level so it is available even when the CSV file cannot be loaded.
cleaning_process = """ Cleaning Process:
1. Identified and addressed inconsistent fields by checking unique values in
each column.
2. Checked for incorrectly formatted data and converted 'house_val' and 'age'
to numeric format.
3. Replaced '1_Unk' in 'age' with NaN and imputed missing 'age' values with
the median.
4. Optionally imputed other missing values as needed."""


def convert_numeric_columns(frame):
    """Return a copy of *frame* with 'age' and 'house_val' made numeric.

    'age': the placeholder '1_Unk' becomes NaN; for every other value the
    text from the first underscore onward is dropped before conversion.
    'house_val': converted directly.
    Values that cannot be parsed as numbers become NaN. *frame* itself is
    not modified.
    """
    result = frame.copy()

    ages = result['age']
    # mask() instead of replace('1_Unk', None): a None replacement value has
    # been interpreted as a forward-fill request by some pandas versions.
    ages = ages.mask(ages == '1_Unk')
    # astype(str) keeps the .str accessor usable when pandas already parsed
    # the column as numbers. regex=True must be explicit because pandas 2.0+
    # treats the pattern as a literal string by default.
    ages = ages.astype(str).str.replace(r'_.*', '', regex=True)
    result['age'] = pd.to_numeric(ages, errors='coerce')

    result['house_val'] = pd.to_numeric(result['house_val'], errors='coerce')
    return result


def impute_missing_values(frame):
    """Return a copy of *frame* with missing 'age' and 'marriage' values filled.

    'age' is filled with its median and 'marriage' with its most frequent
    value. A column that has no non-missing values is left unchanged.
    *frame* itself is not modified.
    """
    result = frame.copy()

    # Assign the filled column back: fillna(inplace=True) on a selected
    # column is chained assignment and does nothing under copy-on-write.
    result['age'] = result['age'].fillna(result['age'].median())

    # mode() is empty when every value is missing, so guard the lookup.
    marriage_modes = result['marriage'].mode()
    if not marriage_modes.empty:
        result['marriage'] = result['marriage'].fillna(marriage_modes.iloc[0])
    return result


# Load the data into a DataFrame with ',' as the separator.
try:
    df = pd.read_csv(file_path, sep=',')
except Exception as e:
    # Best-effort script: report the problem and skip the cleaning steps.
    print("Error loading the CSV file:", e)
    df = None

# Run the inspection and cleaning steps only if the DataFrame was loaded.
if df is not None:
    print("Shape of DataFrame:", df.shape)
    print(df.head())

    # Check for inconsistent fields
    print("\nIdentifying and addressing inconsistent fields:")
    for column in df.columns:
        print(f"Unique values in '{column}':", df[column].unique())

    # Incorrectly formatted data
    print("\nIncorrectly formatted data:")
    print("Data types of columns:")
    print(df.dtypes)

    # Replace '1_Unk' with NaN in 'age' and convert 'age' and 'house_val'
    # to numeric format
    df = convert_numeric_columns(df)

    # Check for missing values
    print("\nIdentifying missing values:")
    print("Missing values count per column:")
    print(df.isnull().sum())

    # Impute missing 'age' values with the median and missing 'marriage'
    # values with the most frequent value
    df = impute_missing_values(df)
Projects 447

