import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
US_emp = pd.read_csv("US_emp.csv", index_col=0)
US_emp.head()
Prefix | FirstName | LastName | BirthDate | MaritalStatus | Gender | EmailAddress | AnnualIncome | TotalChildren | EducationLevel | Occupation | HomeOwner | State | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CustomerKey | |||||||||||||
11000 | MR. | JON | YANG | 4/8/1966 | M | M | jon24@adventure-works.com | 90000 | 2 | Bachelors | Professional | Y | Alabama |
11001 | MR. | EUGENE | HUANG | 5/14/1965 | S | M | eugene10@adventure-works.com | 60000 | 3 | Bachelors | Professional | N | Alaska |
11002 | MR. | RUBEN | TORRES | 8/12/1965 | M | M | ruben35@adventure-works.com | 60000 | 3 | Bachelors | Professional | Y | Arizona |
11003 | MS. | CHRISTY | ZHU | 2/15/1968 | S | F | christy12@adventure-works.com | 70000 | 0 | Bachelors | Professional | N | Arkansas |
11004 | MRS. | ELIZABETH | JOHNSON | 8/8/1968 | S | F | elizabeth5@adventure-works.com | 80000 | 5 | Bachelors | Professional | Y | California |
average=round(US_emp.groupby('EducationLevel')[['AnnualIncome']].mean(),2)
print(average)
AnnualIncome EducationLevel Bachelors 64396.50 Graduate Degree 66150.40 High School 48963.28 Partial College 55090.62 Partial High School 39575.56
total=round(US_emp['AnnualIncome'].mean(),2)
print(total)
57269.12
percentage=round(US_emp.groupby('EducationLevel')[['AnnualIncome']].mean() / US_emp['AnnualIncome'].mean() * 100,0)
print(percentage)
AnnualIncome EducationLevel Bachelors 112.0 Graduate Degree 116.0 High School 85.0 Partial College 96.0 Partial High School 69.0
df = percentage.rename(columns={'AnnualIncome':'AnnualIncome_Percentage'})
df.sort_values('AnnualIncome_Percentage',ascending = False)
AnnualIncome_Percentage | |
---|---|
EducationLevel | |
Graduate Degree | 116.0 |
Bachelors | 112.0 |
Partial College | 96.0 |
High School | 85.0 |
Partial High School | 69.0 |
df[['AnnualIncome_Percentage']].plot.bar()
plt.title('Pajamų priklausomybė nuo išsilavinimo')
plt.xlabel('Education Level')
plt.ylabel('Annual Income Percentage')
plt.rcParams['figure.figsize'] = (15,3)
average1=round(US_emp.groupby('Occupation')[['AnnualIncome']].mean(),2)
print(average1)
AnnualIncome Occupation Clerical 30696.05 Management 92218.53 Manual 16472.59 Professional 74166.67 Skilled Manual 51732.95
percentage1=round(US_emp.groupby('Occupation')[['AnnualIncome']].mean() / US_emp["AnnualIncome"].mean() * 100,0)
print(percentage1)
AnnualIncome Occupation Clerical 54.0 Management 161.0 Manual 29.0 Professional 130.0 Skilled Manual 90.0
df1 = percentage1.rename(columns={'AnnualIncome':'AnnualIncome_Percentage'})
df1.sort_values('AnnualIncome_Percentage',ascending = False)
AnnualIncome_Percentage | |
---|---|
Occupation | |
Management | 161.0 |
Professional | 130.0 |
Skilled Manual | 90.0 |
Clerical | 54.0 |
Manual | 29.0 |
df[['AnnualIncome_Percentage']].plot.bar()
plt.title('Pajamų priklausomybė nuo veiklos pobūdžio')
plt.xlabel('Occupation')
plt.ylabel('Annual Income Percentage')
plt.rcParams['figure.figsize'] = (15,3)
US_emp['If_has_children']=US_emp['TotalChildren']>0
US_emp['If_has_children']=US_emp['If_has_children'].map({True:'Y',False:'N'})
US_emp
Prefix | FirstName | LastName | BirthDate | MaritalStatus | Gender | EmailAddress | AnnualIncome | TotalChildren | EducationLevel | Occupation | HomeOwner | State | If_has_children | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CustomerKey | ||||||||||||||
11000 | MR. | JON | YANG | 4/8/1966 | M | M | jon24@adventure-works.com | 90000 | 2 | Bachelors | Professional | Y | Alabama | Y |
11001 | MR. | EUGENE | HUANG | 5/14/1965 | S | M | eugene10@adventure-works.com | 60000 | 3 | Bachelors | Professional | N | Alaska | Y |
11002 | MR. | RUBEN | TORRES | 8/12/1965 | M | M | ruben35@adventure-works.com | 60000 | 3 | Bachelors | Professional | Y | Arizona | Y |
11003 | MS. | CHRISTY | ZHU | 2/15/1968 | S | F | christy12@adventure-works.com | 70000 | 0 | Bachelors | Professional | N | Arkansas | N |
11004 | MRS. | ELIZABETH | JOHNSON | 8/8/1968 | S | F | elizabeth5@adventure-works.com | 80000 | 5 | Bachelors | Professional | Y | California | Y |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
29479 | MR. | TOMMY | TANG | 7/4/1958 | M | M | tommy2@adventure-works.com | 30000 | 1 | Graduate Degree | Clerical | Y | Utah | Y |
29480 | MRS. | NINA | RAJI | 11/10/1960 | S | F | nina21@adventure-works.com | 30000 | 3 | Graduate Degree | Clerical | Y | Vermont | Y |
29481 | MR. | IVAN | SURI | 1/5/1960 | S | M | ivan0@adventure-works.com | 30000 | 3 | Graduate Degree | Clerical | N | Virginia | Y |
29482 | MR. | CLAYTON | ZHANG | 3/5/1959 | M | M | clayton0@adventure-works.com | 30000 | 3 | Bachelors | Clerical | Y | Washington | Y |
29483 | MR. | J�SUS | NAVARRO | 12/8/1959 | M | M | j�sus9@adventure-works.com | 30000 | 0 | Bachelors | Clerical | Y | Oregon | N |
18148 rows × 14 columns
df2 = US_emp [['MaritalStatus', 'AnnualIncome','If_has_children']]
df2
MaritalStatus | AnnualIncome | If_has_children | |
---|---|---|---|
CustomerKey | |||
11000 | M | 90000 | Y |
11001 | S | 60000 | Y |
11002 | M | 60000 | Y |
11003 | S | 70000 | N |
11004 | S | 80000 | Y |
... | ... | ... | ... |
29479 | M | 30000 | Y |
29480 | S | 30000 | Y |
29481 | S | 30000 | Y |
29482 | M | 30000 | Y |
29483 | M | 30000 | N |
18148 rows × 3 columns
family = round(df2.groupby(['If_has_children', 'MaritalStatus'])['AnnualIncome'].mean(),2)
family
If_has_children MaritalStatus N M 56464.53 S 48910.56 Y M 61147.60 S 56861.37 Name: AnnualIncome, dtype: float64
children =round(df2.groupby('If_has_children')[['AnnualIncome']].mean() / df2["AnnualIncome"].mean() * 100,0)
children.sort_values('AnnualIncome',ascending = False)
AnnualIncome | |
---|---|
If_has_children | |
Y | 104.0 |
N | 90.0 |
children.plot.pie(y='AnnualIncome',
figsize = (7,7),
fontsize = 12,
labeldistance = 0.3,
explode = (0.05,0),
colors = ['steelblue','yellow'],
legend = False,
title = 'Vidutinės pajamos pagal namų ūkyje kartu gyvenančius vaikus')
<AxesSubplot:title={'center':'Vidutinės pajamos pagal namų ūkyje kartu gyvenančius vaikus'}, ylabel='AnnualIncome'>
married=round(df2.groupby('MaritalStatus')[['AnnualIncome']].mean() / df2["AnnualIncome"].mean() * 100,0)
married.sort_values('AnnualIncome',ascending = False)
AnnualIncome | |
---|---|
MaritalStatus | |
M | 105.0 |
S | 94.0 |
married.plot.pie(y='AnnualIncome',
figsize = (7,7),
fontsize = 12,
labeldistance = 0.3,
explode = (0.05,0),
colors = ['steelblue','yellow'],
legend = False,
title = 'Vidutinės pajamos pagal šeimyninę padėtį')
<AxesSubplot:title={'center':'Vidutinės pajamos pagal šeimyninę padėtį'}, ylabel='AnnualIncome'>
gender =US_emp.groupby('Gender')[['AnnualIncome']].mean() / US_emp['AnnualIncome'].mean() * 100
gender.sort_values('AnnualIncome',ascending = False)
AnnualIncome | |
---|---|
Gender | |
F | 100.026708 |
M | 99.973597 |
state_average=round(US_emp.groupby('State')[['AnnualIncome']].mean(),2)
state_average = state_average.sort_values('AnnualIncome',ascending = False).head(5)
state_average
AnnualIncome | |
---|---|
State | |
Colorado | 59338.84 |
North Carolina | 58815.43 |
Virginia | 58539.94 |
Utah | 58181.82 |
California | 58071.63 |
state_average[['AnnualIncome']].plot.bar()
plt.title('Valstijos kuriose darbuotojai gauna didžiausias vidutines pajamas (TOP 5)')
plt.xlabel('State')
plt.ylabel('Annual Income')
plt.rcParams['figure.figsize'] = (5,15)
def income_group(X):
if X <= 35000:
return 'Lower_class'
if X > 35000 and X <= 60000:
return 'Middle_class'
if X > 60000:
return 'Upper_class'
return np.NaN
US_emp['AnnualIncome']=US_emp['AnnualIncome'].map(income_group)
df3 = US_emp[US_emp['HomeOwner'] == 'Y']
df3
Prefix | FirstName | LastName | BirthDate | MaritalStatus | Gender | EmailAddress | AnnualIncome | TotalChildren | EducationLevel | Occupation | HomeOwner | State | If_has_children | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CustomerKey | ||||||||||||||
11000 | MR. | JON | YANG | 4/8/1966 | M | M | jon24@adventure-works.com | Upper_class | 2 | Bachelors | Professional | Y | Alabama | Y |
11002 | MR. | RUBEN | TORRES | 8/12/1965 | M | M | ruben35@adventure-works.com | Middle_class | 3 | Bachelors | Professional | Y | Arizona | Y |
11004 | MRS. | ELIZABETH | JOHNSON | 8/8/1968 | S | F | elizabeth5@adventure-works.com | Upper_class | 5 | Bachelors | Professional | Y | California | Y |
11005 | MR. | JULIO | RUIZ | 8/5/1965 | S | M | julio1@adventure-works.com | Upper_class | 0 | Bachelors | Professional | Y | Colorado | N |
11007 | MR. | MARCO | MEHTA | 5/9/1964 | M | M | marco14@adventure-works.com | Middle_class | 3 | Bachelors | Professional | Y | Connecticut | Y |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
29478 | MR. | DARREN | CARLSON | 5/25/1959 | S | M | darren41@adventure-works.com | Lower_class | 3 | Graduate Degree | Clerical | Y | Texas | Y |
29479 | MR. | TOMMY | TANG | 7/4/1958 | M | M | tommy2@adventure-works.com | Lower_class | 1 | Graduate Degree | Clerical | Y | Utah | Y |
29480 | MRS. | NINA | RAJI | 11/10/1960 | S | F | nina21@adventure-works.com | Lower_class | 3 | Graduate Degree | Clerical | Y | Vermont | Y |
29482 | MR. | CLAYTON | ZHANG | 3/5/1959 | M | M | clayton0@adventure-works.com | Lower_class | 3 | Bachelors | Clerical | Y | Washington | Y |
29483 | MR. | J�SUS | NAVARRO | 12/8/1959 | M | M | j�sus9@adventure-works.com | Lower_class | 0 | Bachelors | Clerical | Y | Oregon | N |
12260 rows × 14 columns
owner=df3.groupby('AnnualIncome')[['HomeOwner']].count()
owner.sort_values('HomeOwner',ascending = False)
HomeOwner | |
---|---|
AnnualIncome | |
Middle_class | 4655 |
Upper_class | 4605 |
Lower_class | 3000 |
owner.plot.pie(y='HomeOwner',
figsize = (7,7),
fontsize = 12,
labeldistance = 0.3,
explode = (0,0.05,0),
colors = ['skyblue','steelblue','yellow'],
legend = False,
autopct = '%1.1f%%',
title = 'Asmenys turintys NT pagal vidutinių pajamų grupes')
<AxesSubplot:title={'center':'Asmenys turintys NT pagal vidutinių pajamų grupes'}, ylabel='HomeOwner'>