-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchoose_features.py
39 lines (25 loc) · 1.07 KB
/
choose_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
from scipy.stats import chi2_contingency
# Read the training set from the current working directory and keep a
# handle on the column under study.
df = pd.read_csv('train.csv')
embarked = df['Embarked']

# Chi-square test of independence between 'Embarked' and 'Survived',
# computed from the table of observed counts for each pair of values.
contingency_table = pd.crosstab(index=embarked, columns=df['Survived'])
chi2, p, dof, expected = chi2_contingency(contingency_table)

# A previous run printed 1.769922284120912e-06, so there might be some
# correlation between the two columns.
print(p)

# Distinct values found in the column, then how many there are.
# NOTE: unique() reports NaN as a value while nunique() skips it by
# default, so the two can disagree by one when entries are missing.
unique_values = embarked.unique()
num_unique_values = embarked.nunique()
print("Unique values in 'Embarked':", unique_values)
print("Number of unique values in 'Embarked':", num_unique_values)

# Missing-value count, left disabled:
# print(df["Embarked"].isnull().sum())

# Store the column with pandas' categorical dtype and show the result.
df['Embarked'] = embarked.astype('category')
print(df["Embarked"])

# Disabled follow-up: record the code -> category mapping, then replace
# the categories with their integer codes.
# code_to_category_dict = dict(enumerate(df['Embarked'].cat.categories))
# print(code_to_category_dict)
# df['Embarked'] = df['Embarked'].cat.codes