import pandas as pd 
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt

# Load the three sample files (CSV, JSON, XML) into separate DataFrames.
df_csv = pd.read_csv('sample_csv.csv')
df_json = pd.read_json('sample_json.json')
df_xml = pd.read_xml('sample_xml.xml')

# A bare `df_csv` expression only displays something as the last line of a
# notebook cell; in a script it is a no-op. Print explicitly so each loaded
# frame is actually shown (pandas truncates long frames on its own).
print(df_csv)
print(df_json)
print(df_xml)

# Visualise where values are missing in the CSV data: one cell per
# (row, column), highlighted where the value is null.
sns.heatmap(df_csv.isnull(), cbar=False)
plt.title('Missing Values Heatmap')
# Show the heatmap as its own figure; without this, the next plotting call
# in a script draws onto the same axes and overlays the heatmap.
plt.show()

# Strategy 1: discard every row that contains at least one missing value.
df_drop = df_csv.dropna()

# Strategy 2: keep all rows and fill missing numeric values with the mean of
# their column, one column at a time. Non-numeric columns are left untouched.
df_impute = df_csv.copy()
numeric_cols = df_impute.select_dtypes(include='number').columns
for col in numeric_cols:
    column_mean = df_impute[col].mean()
    df_impute[col] = df_impute[col].fillna(column_mean)

# Same mean imputation on a fresh copy, done for all numeric columns in a
# single vectorised assignment instead of a Python loop.
df_impute_clean = df_csv.copy()
numeric_cols = df_impute_clean.select_dtypes(include='number').columns
numeric_means = df_impute_clean[numeric_cols].mean()
df_impute_clean[numeric_cols] = df_impute_clean[numeric_cols].fillna(numeric_means)

# Detect and remove outliers in one column of the imputed data using the
# 1.5 * IQR rule: anything below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
col = 'Total Profit'
Q1, Q3 = df_impute[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose value falls strictly outside the [lower, upper] fences.
col_values = df_impute[col]
outside_fences = col_values.lt(lower) | col_values.gt(upper)
outliers = df_impute.loc[outside_fences]
print("Number of outliers:", outliers.shape[0])

# Keep only the rows inside the fences (bounds inclusive).
inside_fences = col_values.ge(lower) & col_values.le(upper)
df_no_outliers = df_impute.loc[inside_fences]

# Sanity check against the SAME bounds computed above: by construction no
# remaining row can violate them, so this count is always 0. Note that it
# rebinds `outliers` to the (empty) result.
kept_values = df_no_outliers[col]
outliers = df_no_outliers.loc[kept_values.lt(lower) | kept_values.gt(upper)]
print("Number of outliers after removing:", outliers.shape[0])


# Compare the distribution of `col` at each stage of cleaning, one figure
# per stage.

# Raw data as loaded from the CSV.
sns.boxplot(x=df_csv[col])
plt.title('Before Cleaning')
plt.show()

# Missing values filled with the column mean; outliers are still present.
sns.boxplot(x=df_impute[col])
plt.title('After Imputation (Still with Outliers)')
plt.show()

# Imputed data with the IQR outliers filtered out. The title previously
# repeated the imputation title, mislabelling this plot.
sns.boxplot(x=df_no_outliers[col])
plt.title('After Removing Outliers')
plt.show()