import numpy as np
import pandas as pd
# Visual libraries
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import seaborn as sns
# Load the Google Play Store dataset from a CSV file in the working directory.
apps = pd.read_csv('Google-Playstore.csv')
# Preview the first rows of the loaded DataFrame.
apps.head()
App Name | App Id | Category | Rating | Rating Count | Installs | Minimum Installs | Maximum Installs | Free | Price | ... | Developer Website | Developer Email | Released | Last Updated | Content Rating | Privacy Policy | Ad Supported | In App Purchases | Editors Choice | Scraped Time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Gakondo | com.ishakwe.gakondo | Adventure | 0.0 | 0.0 | 10+ | 10.0 | 15 | True | 0.0 | ... | https://beniyizibyose.tk/#/ | jean21101999@gmail.com | Feb 26, 2020 | Feb 26, 2020 | Everyone | https://beniyizibyose.tk/projects/ | False | False | False | 2021-06-15 20:19:35 |
1 | Ampere Battery Info | com.webserveis.batteryinfo | Tools | 4.4 | 64.0 | 5,000+ | 5000.0 | 7662 | True | 0.0 | ... | https://webserveis.netlify.app/ | webserveis@gmail.com | May 21, 2020 | May 06, 2021 | Everyone | https://dev4phones.wordpress.com/licencia-de-uso/ | True | False | False | 2021-06-15 20:19:35 |
2 | Vibook | com.doantiepvien.crm | Productivity | 0.0 | 0.0 | 50+ | 50.0 | 58 | True | 0.0 | ... | NaN | vnacrewit@gmail.com | Aug 9, 2019 | Aug 19, 2019 | Everyone | https://www.vietnamairlines.com/vn/en/terms-an... | False | False | False | 2021-06-15 20:19:35 |
3 | Smart City Trichy Public Service Vehicles 17UC... | cst.stJoseph.ug17ucs548 | Communication | 5.0 | 5.0 | 10+ | 10.0 | 19 | True | 0.0 | ... | http://www.climatesmarttech.com/ | climatesmarttech2@gmail.com | Sep 10, 2018 | Oct 13, 2018 | Everyone | NaN | True | False | False | 2021-06-15 20:19:35 |
4 | GROW.me | com.horodyski.grower | Tools | 0.0 | 0.0 | 100+ | 100.0 | 478 | True | 0.0 | ... | http://www.horodyski.com.pl | rmilekhorodyski@gmail.com | Feb 21, 2020 | Nov 12, 2018 | Everyone | http://www.horodyski.com.pl | False | False | False | 2021-06-15 20:19:35 |
5 rows × 24 columns
# The highest rating_count belongs to the Music or Games categories
# The category with the most apps is also the most popular (has the highest rating_count)
# There is a correlation between rating_count and the rating itself
apps.columns
Index(['app_name', 'app_id', 'category', 'rating', 'rating_count', 'installs', 'minimum_installs', 'maximum_installs', 'free', 'price', 'currency', 'size', 'minimum_android', 'developer_id', 'developer_website', 'developer_email', 'released', 'last_updated', 'content_rating', 'privacy_policy', 'ad_supported', 'in_app_purchases', 'editors_choice', 'scraped_time'], dtype='object')
apps.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2312944 entries, 0 to 2312943 Data columns (total 24 columns): # Column Dtype --- ------ ----- 0 app_name object 1 app_id object 2 category object 3 rating float64 4 rating_count float64 5 installs object 6 minimum_installs float64 7 maximum_installs int64 8 free bool 9 price float64 10 currency object 11 size object 12 minimum_android object 13 developer_id object 14 developer_website object 15 developer_email object 16 released object 17 last_updated object 18 content_rating object 19 privacy_policy object 20 ad_supported bool 21 in_app_purchases bool 22 editors_choice bool 23 scraped_time object dtypes: bool(4), float64(4), int64(1), object(15) memory usage: 361.8+ MB
#Some columns have incorrect data types: released and size. Released should be a datetime. Size is probably read as a string because each size contains the letter 'M' to indicate megabytes. These issues will be added to the list too.
#Below we can see how many numeric columns we have
# Integer and float dtypes that count as "numeric" for this check.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Keep only the columns whose dtype is one of the numeric dtypes above.
numeric_df = apps.select_dtypes(include=numerics)
# Number of numeric columns.
len(numeric_df.columns)
5
apps.describe()
rating | rating_count | minimum_installs | maximum_installs | price | |
---|---|---|---|---|---|
count | 2.290061e+06 | 2.290061e+06 | 2.312837e+06 | 2.312944e+06 | 2.312944e+06 |
mean | 2.203152e+00 | 2.864839e+03 | 1.834452e+05 | 3.202017e+05 | 1.034992e-01 |
std | 2.106223e+00 | 2.121626e+05 | 1.513144e+07 | 2.355495e+07 | 2.633127e+00 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
25% | 0.000000e+00 | 0.000000e+00 | 5.000000e+01 | 8.400000e+01 | 0.000000e+00 |
50% | 2.900000e+00 | 6.000000e+00 | 5.000000e+02 | 6.950000e+02 | 0.000000e+00 |
75% | 4.300000e+00 | 4.200000e+01 | 5.000000e+03 | 7.354000e+03 | 0.000000e+00 |
max | 5.000000e+00 | 1.385576e+08 | 1.000000e+10 | 1.205763e+10 | 4.000000e+02 |
apps.shape
(2312944, 24)
# Display in normal notation instead of scientific
apps.describe().apply(lambda s: s.apply('{0:.5f}'.format))
rating | rating_count | minimum_installs | maximum_installs | price | |
---|---|---|---|---|---|
count | 2290061.00000 | 2290061.00000 | 2312837.00000 | 2312944.00000 | 2312944.00000 |
mean | 2.20315 | 2864.83888 | 183445.21396 | 320201.71314 | 0.10350 |
std | 2.10622 | 212162.57104 | 15131439.05973 | 23554954.88655 | 2.63313 |
min | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
25% | 0.00000 | 0.00000 | 50.00000 | 84.00000 | 0.00000 |
50% | 2.90000 | 6.00000 | 500.00000 | 695.00000 | 0.00000 |
75% | 4.30000 | 42.00000 | 5000.00000 | 7354.00000 | 0.00000 |
max | 5.00000 | 138557570.00000 | 10000000000.00000 | 12057627016.00000 | 400.00000 |
#All numerical columns look realistic; for example, rating lies between 0 and 5.
#But the maximum price is $400, which is a bit suspicious. We will dig into that later.
# How much missing data, NaN
apps.isna().sum().sort_values(ascending=False)
developer_website 760835 privacy_policy 420953 released 71053 rating 22883 rating_count 22883 minimum_android 6530 size 196 currency 135 installs 107 minimum_installs 107 developer_id 33 developer_email 31 app_name 2 app_id 0 price 0 free 0 maximum_installs 0 last_updated 0 content_rating 0 category 0 ad_supported 0 in_app_purchases 0 editors_choice 0 scraped_time 0 dtype: int64
Nan_percent=apps.isna().sum().sort_values(ascending=False)/len(apps)
Nan_percent
developer_website 3.289466e-01 privacy_policy 1.819988e-01 released 3.071972e-02 rating 9.893452e-03 rating_count 9.893452e-03 minimum_android 2.823242e-03 size 8.474049e-05 currency 5.836717e-05 installs 4.626139e-05 minimum_installs 4.626139e-05 developer_id 1.426753e-05 developer_email 1.340283e-05 app_name 8.646988e-07 app_id 0.000000e+00 price 0.000000e+00 free 0.000000e+00 maximum_installs 0.000000e+00 last_updated 0.000000e+00 content_rating 0.000000e+00 category 0.000000e+00 ad_supported 0.000000e+00 in_app_purchases 0.000000e+00 editors_choice 0.000000e+00 scraped_time 0.000000e+00 dtype: float64
type(Nan_percent)
pandas.core.series.Series
Nan_percent[Nan_percent !=0 ]
Developer Website 3.289466e-01 Privacy Policy 1.819988e-01 Released 3.071972e-02 Rating 9.893452e-03 Rating Count 9.893452e-03 Minimum Android 2.823242e-03 Size 8.474049e-05 Currency 5.836717e-05 Installs 4.626139e-05 Minimum Installs 4.626139e-05 Developer Id 1.426753e-05 Developer Email 1.340283e-05 App Name 8.646988e-07 dtype: float64
Nan_percent[Nan_percent !=0 ].plot(kind='barh')
<AxesSubplot:>
# We can drop columns 'Developer Website' and 'Privacy Policy'
apps['category'].value_counts()
Education 241090 Music & Audio 154906 Tools 143988 Business 143771 Entertainment 138276 Lifestyle 118331 Books & Reference 116728 Personalization 89210 Health & Fitness 83510 Productivity 79698 Shopping 75256 Food & Drink 73927 Travel & Local 67288 Finance 65466 Arcade 53792 Puzzle 51168 Casual 50813 Communication 48167 Sports 47483 Social 44734 News & Magazines 42807 Photography 35552 Medical 32065 Action 27555 Maps & Navigation 26722 Simulation 23282 Adventure 23203 Educational 21308 Art & Design 18539 Auto & Vehicles 18280 House & Home 14369 Video Players & Editors 14015 Events 12841 Trivia 11795 Beauty 11772 Board 10588 Racing 10362 Role Playing 10034 Word 8630 Strategy 8526 Card 8179 Weather 7246 Dating 6524 Libraries & Demo 5198 Casino 5076 Music 4202 Parenting 3810 Comics 2862 Name: category, dtype: int64
#Some categories of interest, like Music and Education, are given with different labels:
#there are both 'Music & Audio' and 'Music' labels, as well as 'Education' and 'Educational' for education.
#They should be merged together to represent a single category.
#Later, after the cleaning is finished, we will subset for the top 8 categories.
#Before we further explore, let's deal with the issues we highlighted. Here is the final list:
# Issues List For the Dataset:
## Missing values in several cols: Rating, rating count, Installs, minimum and maximum installs, currency and more
## Drop these columns: App ID, minimum android version, developer ID, website and email, privacy policy link.
## Incorrect data types for release data and size
## Music and education is represented by different labels
## Drop unnecessary categories
# Normalize the column names to snake_case:
# lower-case, surrounding whitespace trimmed, inner spaces replaced by '_'.
apps.columns = [
    name.lower().strip().replace(' ', '_') for name in apps.columns
]
apps.columns
Index(['app_name', 'app_id', 'category', 'rating', 'rating_count', 'installs', 'minimum_installs', 'maximum_installs', 'free', 'price', 'currency', 'size', 'minimum_android', 'developer_id', 'developer_website', 'developer_email', 'released', 'last_updated', 'content_rating', 'privacy_policy', 'ad_supported', 'in_app_purchases', 'editors_choice', 'scraped_time'], dtype='object')
# Identifier, contact and link columns that are not needed for the analysis.
to_drop = [
    'app_id', 'minimum_android',
    'developer_id', 'developer_website', 'developer_email', 'privacy_policy',
]
apps.drop(to_drop, axis='columns', inplace=True)
# Sanity check: none of the dropped columns may remain.
# The previous check (`apps.columns.all() not in to_drop`) reduced the whole
# index to a single value before the membership test, so it never inspected
# the individual column names.
leftover = set(to_drop).intersection(apps.columns)
assert not leftover, f'columns were not dropped: {sorted(leftover)}'
apps.columns
Index(['app_name', 'category', 'rating', 'rating_count', 'installs', 'minimum_installs', 'maximum_installs', 'free', 'price', 'currency', 'size', 'released', 'last_updated', 'content_rating', 'ad_supported', 'in_app_purchases', 'editors_choice', 'scraped_time'], dtype='object')
len(apps.columns)
18
apps.category
0 Adventure 1 Tools 2 Productivity 3 Communication 4 Tools ... 2312939 Role Playing 2312940 Education 2312941 Education 2312942 Music & Audio 2312943 Trivia Name: category, Length: 2312944, dtype: object
Categories = apps.category.unique()
len(Categories)
48
# Merge duplicate labels so that each topic is a single category:
# 'Music & Audio' becomes 'Music' and 'Educational' becomes 'Education'.
category_merges = (
    ('Music & Audio', 'Music'),
    ('Educational', 'Education'),
)
for old_label, new_label in category_merges:
    apps['category'] = apps['category'].str.replace(old_label, new_label)
apps.category
0 Adventure 1 Tools 2 Productivity 3 Communication 4 Tools ... 2312939 Role Playing 2312940 Education 2312941 Education 2312942 Music 2312943 Trivia Name: category, Length: 2312944, dtype: object
Categories
array(['Adventure', 'Tools', 'Productivity', 'Communication', 'Social', 'Libraries & Demo', 'Lifestyle', 'Personalization', 'Racing', 'Maps & Navigation', 'Travel & Local', 'Food & Drink', 'Books & Reference', 'Medical', 'Puzzle', 'Entertainment', 'Arcade', 'Auto & Vehicles', 'Photography', 'Health & Fitness', 'Education', 'Shopping', 'Board', 'Music & Audio', 'Sports', 'Beauty', 'Business', 'Educational', 'Finance', 'News & Magazines', 'Casual', 'Art & Design', 'House & Home', 'Card', 'Events', 'Trivia', 'Weather', 'Strategy', 'Word', 'Video Players & Editors', 'Action', 'Simulation', 'Music', 'Dating', 'Role Playing', 'Casino', 'Comics', 'Parenting'], dtype=object)
apps_by_categories = apps.category.value_counts()
apps_by_categories
Education 262398 Music 159108 Tools 143988 Business 143771 Entertainment 138276 Lifestyle 118331 Books & Reference 116728 Personalization 89210 Health & Fitness 83510 Productivity 79698 Shopping 75256 Food & Drink 73927 Travel & Local 67288 Finance 65466 Arcade 53792 Puzzle 51168 Casual 50813 Communication 48167 Sports 47483 Social 44734 News & Magazines 42807 Photography 35552 Medical 32065 Action 27555 Maps & Navigation 26722 Simulation 23282 Adventure 23203 Art & Design 18539 Auto & Vehicles 18280 House & Home 14369 Video Players & Editors 14015 Events 12841 Trivia 11795 Beauty 11772 Board 10588 Racing 10362 Role Playing 10034 Word 8630 Strategy 8526 Card 8179 Weather 7246 Dating 6524 Libraries & Demo 5198 Casino 5076 Parenting 3810 Comics 2862 Name: category, dtype: int64
# Names of the 8 categories with the most apps, kept for later subsetting.
# The Series is built directly from the value_counts index instead of
# reset_index().loc[:, 'index'], because the name of the column that
# reset_index() produces for a value_counts result differs between pandas
# versions. The explicit name keeps the displayed Series unchanged.
top_8_list = pd.Series(apps_by_categories.head(8).index, name='index')
top_8_list
0 Education 1 Music 2 Tools 3 Business 4 Entertainment 5 Lifestyle 6 Books & Reference 7 Personalization Name: index, dtype: object
# Keep only the rows whose category is one of the top 8, and renumber the
# index from 0 so it no longer carries the row positions of `apps`.
top = apps[apps['category'].isin(top_8_list)].reset_index(drop=True)
top
app_name | category | rating | rating_count | installs | minimum_installs | maximum_installs | free | price | currency | size | released | last_updated | content_rating | ad_supported | in_app_purchases | editors_choice | scraped_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ampere Battery Info | Tools | 4.4 | 64.0 | 5,000+ | 5000.0 | 7662 | True | 0.0 | USD | 2.9M | May 21, 2020 | May 06, 2021 | Everyone | True | False | False | 2021-06-15 20:19:35 |
1 | GROW.me | Tools | 0.0 | 0.0 | 100+ | 100.0 | 478 | True | 0.0 | USD | 6.2M | Feb 21, 2020 | Nov 12, 2018 | Everyone | False | False | False | 2021-06-15 20:19:35 |
2 | The Everyday Calendar | Lifestyle | 2.0 | 39.0 | 500+ | 500.0 | 702 | True | 0.0 | USD | 16M | Jun 21, 2019 | Jun 21, 2019 | Everyone | False | False | False | 2021-06-15 20:19:35 |
3 | Neon 3d Iron Tech Keyboard Theme | Personalization | 4.7 | 820.0 | 50,000+ | 50000.0 | 62433 | True | 0.0 | USD | 3.5M | Sep 22, 2019 | Oct 07, 2020 | Everyone | True | False | False | 2021-06-15 20:19:35 |
4 | All in one shopping app | Tools | 5.0 | 6.0 | 50+ | 50.0 | 62 | True | 0.0 | USD | 2.0M | Apr 24, 2019 | May 05, 2019 | Everyone | False | False | False | 2021-06-15 20:19:36 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1171805 | STMIK Mercusuar - Aditya Rachman | Education | 0.0 | 0.0 | 5+ | 5.0 | 7 | True | 0.0 | USD | 6.6M | Jan 15, 2020 | Jan 15, 2020 | Everyone | False | False | False | 2021-06-16 12:59:18 |
1171806 | Lero TOEFL Recorder + Timer | Education | 3.4 | 17.0 | 1,000+ | 1000.0 | 1980 | True | 0.0 | USD | 10M | May 22, 2018 | Dec 14, 2018 | Everyone | True | False | False | 2021-06-16 12:59:18 |
1171807 | ORU Online | Education | 0.0 | 0.0 | 100+ | 100.0 | 430 | True | 0.0 | USD | 44M | Jan 17, 2018 | Feb 02, 2018 | Everyone | False | False | False | 2021-06-16 12:59:19 |
1171808 | Data Structure | Education | 0.0 | 0.0 | 100+ | 100.0 | 202 | True | 0.0 | USD | 29M | Aug 19, 2018 | Aug 19, 2018 | Everyone | False | False | False | 2021-06-16 12:59:19 |
1171809 | Devi Suktam | Music | 3.5 | 8.0 | 1,000+ | 1000.0 | 2635 | True | 0.0 | USD | 10M | Aug 1, 2016 | May 05, 2021 | Everyone | True | False | False | 2021-06-16 12:59:19 |
1171810 rows × 18 columns
top.shape
(1171810, 18)
top.category.value_counts()
Education 262398 Music 159108 Tools 143988 Business 143771 Entertainment 138276 Lifestyle 118331 Books & Reference 116728 Personalization 89210 Name: category, dtype: int64
# Parse release dates such as 'Feb 26, 2020' into datetimes.
# Specifying the datetime format significantly reduces conversion time;
# values that do not match the format become NaT (errors='coerce').
# infer_datetime_format is not passed: it only applies when no format is
# given, and it is deprecated in pandas 2.0.
apps['released'] = pd.to_datetime(apps['released'], format='%b %d, %Y',
                                  errors='coerce')
apps['released'].dtype
dtype('<M8[ns]')
# Strip all letters (e.g. the 'M' in '2.9M') and convert to numeric;
# anything that still is not a number becomes NaN (errors='coerce').
# regex=True is explicit because the pattern is a regular expression and the
# default of Series.str.replace changes to regex=False in newer pandas, which
# would leave the letters in place and turn every value into NaN.
# NOTE(review): dropping the letters assumes every size uses the same unit —
# confirm there are no values in other units before comparing sizes.
apps['size'] = pd.to_numeric(
    apps['size'].str.replace(r'[a-zA-Z]+', '', regex=True),
    errors='coerce')
C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/1235571128.py:2: FutureWarning: The default value of regex will change from True to False in a future version. apps['size'] = pd.to_numeric(apps['size'].str.replace(r'[a-zA-Z]+', ''),
apps['size'].dtype == 'float64'
True
# Category
# Rating
# Installs
# App Name
# Price
# Split the categories by number of apps: at least 30000 apps vs. fewer.
Popular_categories = apps_by_categories[apps_by_categories>=30000]
Unpopular_categories = apps_by_categories[apps_by_categories<30000]
# Number of categories in each group (a notebook cell only displays the last
# expression).
len(Popular_categories)
len(Unpopular_categories)
23
# Distribution of the number of apps per category.
# histplot replaces the deprecated distplot; kde=True with stat='density'
# keeps the histogram-plus-density-curve view on a density axis.
sns.histplot(apps_by_categories, kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Category', ylabel='Density'>
# Histogram of all app ratings, split into 20 bins.
fig, ax = plt.subplots()
ax.hist(apps['rating'], bins=20)
# Title and axis labels
ax.set_title('A Histogram of App ratings')
ax.set_xlabel('Rating out of 5.0')
ax.set_ylabel('Count')
plt.show()
# From this histogram we can see that a large number of apps have no rating (a rating of 0). We should exclude them.
fig, ax = plt.subplots()
# Keep only the ratings that are strictly greater than 0.
over_0 = apps.loc[apps['rating'] > 0, 'rating']
# Histogram of the remaining ratings, split into 15 bins.
ax.hist(over_0, bins=15)
# Title and axis labels
ax.set_title('A Histogram of App ratings')
ax.set_xlabel('Rating out of 5.0')
ax.set_ylabel('Count')
plt.show();
#The histogram shows that the majority of apps are rated between ~3.8 and 4.8. It is also surprising to see so many 5-star ratings.
Categories = apps.category.unique()
len(Categories)
46
#Remember that our top 8 categories are stored in top_8_list, and the new DataFrame containing only these categories is called top
fig, ax = plt.subplots()
# Share of apps in each of the 8 categories, drawn as horizontal bars.
category_share = top['category'].value_counts(normalize=True)
category_share.plot.barh(ax=ax)
# Title and axis labels
ax.set_title('Proportion of 8 Categories')
ax.set_xlabel('Proportion')
ax.set_ylabel('')
plt.show();
# Save the figure to disk under the same name as its title.
fig.savefig('Proportion of 8 Categories')
#Looks like education apps make up more than one fifth of the data.
#It would be ideal if we had the install count given as integers.
def string_to_numeric(data):
    """Convert an install-count label such as '5,000+' to an integer.

    Thousands separators (commas), surrounding whitespace and a trailing '+'
    are removed before the conversion. Values that are not strings (for
    example NaN for a missing install count) are returned unchanged, so the
    function can be applied to a column that contains missing values.

    Raises:
        ValueError: if the cleaned string is not a valid integer.
    """
    if not isinstance(data, str):
        return data
    # Strip the '+' only when it is present: unconditionally dropping the
    # last character would cut a digit from values like '5000'.
    cleaned = data.replace(',', '').strip().rstrip('+')
    return int(cleaned)
top.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1171810 entries, 0 to 1171809 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 app_name 1171810 non-null object 1 category 1171810 non-null object 2 rating 1164547 non-null float64 3 rating_count 1164547 non-null float64 4 installs 1171802 non-null object 5 minimum_installs 1171802 non-null float64 6 maximum_installs 1171810 non-null int64 7 free 1171810 non-null bool 8 price 1171810 non-null float64 9 currency 1171792 non-null object 10 size 1142535 non-null float64 11 released 1144512 non-null datetime64[ns] 12 last_updated 1171810 non-null object 13 content_rating 1171810 non-null object 14 ad_supported 1171810 non-null bool 15 in_app_purchases 1171810 non-null bool 16 editors_choice 1171810 non-null bool 17 scraped_time 1171810 non-null object dtypes: bool(4), datetime64[ns](1), float64(5), int64(1), object(7) memory usage: 129.6+ MB
# Remove the '+' suffix and the thousands separators from the install labels.
# regex=False makes both replacements literal: '+' is a regex metacharacter,
# and pandas warns that single characters will stop being treated as
# literals under the regex default.
# The values stay strings and missing labels stay NaN. The former
# `top['installs'] = top['installs'].dropna()` was removed because assigning
# the result back re-aligns on the index and restores the NaN rows.
top['installs'] = (top.installs
                   .str.replace('+', '', regex=False)
                   .str.replace(',', '', regex=False))
top['installs']
C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/2440583364.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. top['installs'] = (top.installs.str.replace('+', ''))
0 5000 1 100 2 500 3 50000 4 50 ... 1171805 5 1171806 1000 1171807 100 1171808 100 1171809 1000 Name: installs, Length: 1171810, dtype: object
# top['installs']=top['installs'].apply(string_to_numeric)
fig, ax = plt.subplots()
# Share of apps per install label, drawn as horizontal bars.
install_share = top.installs.value_counts(normalize=True)
install_share.plot.barh(ax=ax)
ax.set_title('Proportion of install categories')
ax.set_xlabel('Proportion')
ax.set_ylabel('')
plt.show();
#The plot shows that the vast majority of apps have between 10 and 10k installs.
#Maybe we could get better insight by plotting rating_count.
#The number of ratings is given as exact figures and, logically, it is positively related to the install count.
#Before plotting, let's get the summary statistics of rating_count (the number of ratings):
# Print the summary statistics of rating_count in plain decimal notation
# instead of scientific notation.
rating_count_summary = top.rating_count.describe()
with pd.option_context('display.float_format', '{:f}'.format):
    print(rating_count_summary)
count 1164547.000000 mean 1210.463652 std 62804.289848 min 0.000000 25% 0.000000 50% 6.000000 75% 39.000000 max 35128398.000000 Name: rating_count, dtype: float64
# From this summary we can see that 75% of apps have at most 39 ratings, so we will focus on apps with rating_count < 39
fig, ax = plt.subplots()
# Apps whose rating count is below 39.
percentile_75 = top.loc[top['rating_count'] < 39]
# Histogram of their rating counts, split into 15 bins.
ax.hist(percentile_75['rating_count'], bins=15)
ax.set_title('Histogram of Rating Count')
ax.set_xlabel('Ratings')
ax.set_ylabel('Count')
plt.show();
#This histogram tells us that about half of the apps have no more than 5 ratings.
#This shows how competitive the mobile market is.
#Only a small proportion of apps become as popular as the ones with thousands of ratings.
# Apps with more than one million ratings.
has_over_million_ratings = top['rating_count'].gt(1e6)
over_mln = top.loc[has_over_million_ratings]
over_mln.shape
(169, 18)
#Out of the roughly 1.17 million apps in the top 8 categories, only 169 have over 1 million ratings
# Number of apps with more than one million ratings in each category.
fig, ax = plt.subplots()
over_mln.category.value_counts().plot.barh(ax=ax)
# On a horizontal bar chart the y-axis holds the categories and the x-axis
# holds the number of apps, so the count label belongs on the x-axis (it was
# previously attached to the category axis as 'Rating Count').
ax.set(title='Comparison of Rating Count of Most Popular Apps by Category',
       xlabel='Number of apps with over 1M ratings', ylabel='')
plt.show();
# Save the figure to disk under the same name as its title.
fig.savefig('Comparison of Rating Count of Most Popular Apps by Category')
#Not surprisingly, more than 50 of these apps belong to Tools, which probably includes the most popular everyday apps.
# Create a mask for paid apps
# Boolean mask that is True for apps whose price is not 0.
is_paid = top['price'].ne(0)
# Print the price summary of paid apps in plain decimal notation.
with pd.option_context('display.float_format', '{:f}'.format):
    print(top.loc[is_paid, 'price'].describe())
count 26116.000000 mean 4.882284 std 16.846465 min 0.194824 25% 0.990000 50% 1.990000 75% 3.990000 max 399.990000 Name: price, dtype: float64
#From this data we can see that 75% of paid apps cost about $4 or less. We will focus on apps that cost less than $10
fig, ax = plt.subplots()
# Paid apps that cost less than 10$ (price strictly between 0 and 10).
price = top['price']
less_10 = top[(price > 0) & (price < 10)]
# Histogram of their prices, split into 15 bins.
ax.hist(less_10['price'], bins=15)
ax.set_title('PMF of Price of Paid Apps')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Count')
plt.show();
# We can see that most paid apps cost around $1
# Which category has the most expensive apps?
# Box plot of paid-app prices per category on a logarithmic price axis.
fig, ax = plt.subplots()
sns.boxplot(x='category', y='price', data=top[is_paid], ax=ax)
ax.set_yscale('log')
ax.set(title='Comparison of Price Between Categories',
       xlabel='', ylabel='Price ($)')
# Rotate xtick labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=60)
# Set custom y ticks. The positions are fixed first so that every label
# sits on its own value; set_yticklabels alone relabels whatever ticks the
# log locator picked (hence the FixedFormatter/FixedLocator warning).
y_ticks = [0.3, 0.5, 1, 3, 5, 10, 30, 100, 300]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticks)
plt.show();
C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/3100383024.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(y_ticks)
# Looking at the median price for each category, Business seems to be the winner, closely followed by Books & Reference.
# Now, let's see if more ratings mean higher ratings.
# Again, we will only look at apps with fewer than 100k ratings and exclude the ones with no ratings.
0 Ampere Battery Info 1 GROW.me 2 The Everyday Calendar 3 Neon 3d Iron Tech Keyboard Theme 4 All in one shopping app ... 1171805 STMIK Mercusuar - Aditya Rachman 1171806 Lero TOEFL Recorder + Timer 1171807 ORU Online 1171808 Data Structure 1171809 Devi Suktam Name: app_name, Length: 1171810, dtype: object
# Scatter plot of rating against rating count (log x-axis) for apps that
# have at least one and fewer than 100k ratings.
fig, ax = plt.subplots()
# Filter out undesired apps
majority_rated = top[(top['rating_count'] > 0) & (top['rating_count'] < 1e5)]
# Jitter the ratings so that identical values do not overlap completely
rating_jittered = (majority_rated['rating']
                   + np.random.normal(0, 0.12, len(majority_rated)))
# Jitter the number of ratings
# NOTE(review): additive jitter can push small counts to zero or below,
# which a log axis cannot display — consider multiplicative jitter.
count_jittered = (majority_rated['rating_count']
                  + np.random.normal(0, 1, len(majority_rated)))
ax.plot(count_jittered, rating_jittered,
        marker='o', linestyle='none', markersize=1, alpha=0.05)
# Use log scale
ax.set_xscale('log')
# Set custom ticks. The positions are fixed first so that every label sits
# on its own value; set_xticklabels alone relabels whatever ticks the log
# locator picked (hence the FixedFormatter/FixedLocator warning).
x_tick_labels = [1, 10, 50, 100, 200, 500, 1000, 5000, 10000, 50000, 100000]
ax.set_xticks(x_tick_labels)
ax.set_xticklabels(x_tick_labels)
# Label
ax.set(title='Rating Count vs. Rating',
       xlabel='Rating Count',
       ylabel='Rating')
plt.show();
C:\Users\Monika\AppData\Local\Temp/ipykernel_7160/2713553702.py:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(x_tick_labels)
# Correlation heatmap of the numeric and boolean columns of `top`.
# The columns are selected explicitly because newer pandas versions no longer
# drop non-numeric columns in DataFrame.corr() by default and fail on text
# columns instead; number + bool matches what the implicit selection kept.
numeric_top = top.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_top.corr(), annot=True,
            linewidths=3, center=0, cmap='rainbow');
# Save the current figure (the heatmap) to disk.
results_path = 'results.png'
plt.savefig(results_path)
#The heatmap gives a correlation coefficient of r=0.019 between rating and rating count.
#This is close to zero, so the linear relationship is very weak; the positive relationship we suspected earlier, if it exists, is non-linear.
# Export the `top` DataFrame to the working directory as JSON and as CSV
# (the file names suggest the files are meant for Tableau).
top.to_json('top_file_tableau.json')
# The CSV is written with pandas' defaults, so the index is included as the
# first column.
top.to_csv('top_file_tableau.csv')