Page 450 - AI Ver 3.0 Class 11
P. 450

# Present initial findings from the cleaned dataset
                       print("\nInitial Findings after Cleaning:")
                       print("Number of records after cleaning:", len(df))
                       print("Summary statistics for numerical columns:")
                       print(df.describe())
                       print("Summary statistics for categorical columns:")
                       print(df.describe(include=['object']))
                   else:
                       print("DataFrame not loaded successfully.")


                   from sklearn.model_selection import train_test_split
                   from sklearn.preprocessing import OneHotEncoder, StandardScaler
                   from sklearn.compose import ColumnTransformer
                   from sklearn.impute import SimpleImputer
                   from sklearn.pipeline import Pipeline
                   from sklearn.ensemble import RandomForestClassifier
                   from sklearn.svm import SVC
                   from sklearn.metrics import accuracy_score
                   # Build and train two classification pipelines (Random Forest and SVM)
                   # on the cleaned DataFrame. Nothing runs if the DataFrame is missing
                   # or empty.
                   if df is not None and not df.empty:
                       # Split the dataset into features (X) and the target variable (y);
                       # the 'flag' column is the label being predicted.
                       X = df.drop(columns=['flag'])  # Features
                       y = df['flag']  # Target variable

                       # Hold out 20% of the rows for testing; the fixed random_state
                       # makes the split reproducible between runs.
                       X_train, X_test, y_train, y_test = train_test_split(
                           X, y, test_size=0.2, random_state=42)

                       # Pick the columns for each preprocessing branch by dtype.
                       numeric_features = X_train.select_dtypes(
                           include=['int64', 'float64']).columns
                       categorical_features = X_train.select_dtypes(
                           include=['object']).columns

                       # Numerical columns: fill gaps with the median, then standardise.
                       numeric_transformer = Pipeline(steps=[
                           ('imputer', SimpleImputer(strategy='median')),
                           ('scaler', StandardScaler())])

                       # Categorical columns: impute missing values with the most
                       # frequent value, then one-hot encode. handle_unknown='ignore'
                       # keeps categories first seen at prediction time from raising.
                       categorical_transformer = Pipeline(steps=[
                           ('imputer', SimpleImputer(strategy='most_frequent')),
                           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

                       # Route each group of columns through its own transformer.
                       preprocessor = ColumnTransformer(
                           transformers=[
                               ('num', numeric_transformer, numeric_features),
                               ('cat', categorical_transformer, categorical_features)])

                       # Combine preprocessing with the model so the preprocessing
                       # steps are fitted on the training data only.
                       rf_pipeline = Pipeline(steps=[
                           ('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])
                       svm_pipeline = Pipeline(steps=[
                           ('preprocessor', preprocessor),
                           ('classifier', SVC())])

                       # Model Training
                       rf_pipeline.fit(X_train, y_train)
                       svm_pipeline.fit(X_train, y_train)

                    448     Touchpad Artificial Intelligence (Ver. 3.0)-XI
   445   446   447   448   449   450   451   452   453   454   455