import numpy as np
import pandas as pd
# Visual libraries
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
from matplotlib.ticker import PercentFormatter


import seaborn as sns


apps = pd.read_csv('Google-Playstore.csv')
apps.head()


# The highest rating_count has Category Music or Games
# The category which has the most apps is the most popular (having the most rating_count)
# There is a correlation between rating_count and the rating itself


apps.columns

Index(['app_name', 'app_id', 'category', 'rating', 'rating_count', 'installs',
       'minimum_installs', 'maximum_installs', 'free', 'price', 'currency',
       'size', 'minimum_android', 'developer_id', 'developer_website',
       'developer_email', 'released', 'last_updated', 'content_rating',
       'privacy_policy', 'ad_supported', 'in_app_purchases', 'editors_choice',
       'scraped_time'],
      dtype='object')


apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312944 entries, 0 to 2312943
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   app_name           object 
 1   app_id             object 
 2   category           object 
 3   rating             float64
 4   rating_count       float64
 5   installs           object 
 6   minimum_installs   float64
 7   maximum_installs   int64  
 8   free               bool   
 9   price              float64
 10  currency           object 
 11  size               object 
 12  minimum_android    object 
 13  developer_id       object 
 14  developer_website  object 
 15  developer_email    object 
 16  released           object 
 17  last_updated       object 
 18  content_rating     object 
 19  privacy_policy     object 
 20  ad_supported       bool   
 21  in_app_purchases   bool   
 22  editors_choice     bool   
 23  scraped_time       object 
dtypes: bool(4), float64(4), int64(1), object(15)
memory usage: 361.8+ MB


#Some columns have incorrect data types: Released, Size. Released should be a datetime. Size is probably rendered as string because each size contains the letter 'M' to indicate megabytes. These issues will be added to the list too.


#Below we can see how many numeric columns we have


# Dtype names that count as numeric for this overview.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
# Restrict the frame to those dtypes and report how many columns remain.
numeric_df = apps.select_dtypes(numerics)
numeric_df.shape[1]

5


 apps.describe()


apps.shape

(2312944, 24)


# Display in normal notation instead of scientific
apps.describe().apply(lambda s: s.apply('{0:.5f}'.format))


#Looks like all numerical columns looks realistic, like rating should be between 0 and 5.
#But the maximum value for price is 400$ which is a bit suspicious. We will dig into that later.


# How much missing data, NaN


apps.isna().sum().sort_values(ascending=False)

developer_website    760835
privacy_policy       420953
released              71053
rating                22883
rating_count          22883
minimum_android        6530
size                    196
currency                135
installs                107
minimum_installs        107
developer_id             33
developer_email          31
app_name                  2
app_id                    0
price                     0
free                      0
maximum_installs          0
last_updated              0
content_rating            0
category                  0
ad_supported              0
in_app_purchases          0
editors_choice            0
scraped_time              0
dtype: int64


Nan_percent=apps.isna().sum().sort_values(ascending=False)/len(apps)
Nan_percent

developer_website    3.289466e-01
privacy_policy       1.819988e-01
released             3.071972e-02
rating               9.893452e-03
rating_count         9.893452e-03
minimum_android      2.823242e-03
size                 8.474049e-05
currency             5.836717e-05
installs             4.626139e-05
minimum_installs     4.626139e-05
developer_id         1.426753e-05
developer_email      1.340283e-05
app_name             8.646988e-07
app_id               0.000000e+00
price                0.000000e+00
free                 0.000000e+00
maximum_installs     0.000000e+00
last_updated         0.000000e+00
content_rating       0.000000e+00
category             0.000000e+00
ad_supported         0.000000e+00
in_app_purchases     0.000000e+00
editors_choice       0.000000e+00
scraped_time         0.000000e+00
dtype: float64


type(Nan_percent)

pandas.core.series.Series


Nan_percent[Nan_percent !=0 ]

Developer Website    3.289466e-01
Privacy Policy       1.819988e-01
Released             3.071972e-02
Rating               9.893452e-03
Rating Count         9.893452e-03
Minimum Android      2.823242e-03
Size                 8.474049e-05
Currency             5.836717e-05
Installs             4.626139e-05
Minimum Installs     4.626139e-05
Developer Id         1.426753e-05
Developer Email      1.340283e-05
App Name             8.646988e-07
dtype: float64


Nan_percent[Nan_percent !=0 ].plot(kind='barh')

<AxesSubplot:>


# We can drop columns 'Developer Website' and 'Privacy Policy'


apps['category'].value_counts()

Education                  241090
Music & Audio              154906
Tools                      143988
Business                   143771
Entertainment              138276
Lifestyle                  118331
Books & Reference          116728
Personalization             89210
Health & Fitness            83510
Productivity                79698
Shopping                    75256
Food & Drink                73927
Travel & Local              67288
Finance                     65466
Arcade                      53792
Puzzle                      51168
Casual                      50813
Communication               48167
Sports                      47483
Social                      44734
News & Magazines            42807
Photography                 35552
Medical                     32065
Action                      27555
Maps & Navigation           26722
Simulation                  23282
Adventure                   23203
Educational                 21308
Art & Design                18539
Auto & Vehicles             18280
House & Home                14369
Video Players & Editors     14015
Events                      12841
Trivia                      11795
Beauty                      11772
Board                       10588
Racing                      10362
Role Playing                10034
Word                         8630
Strategy                     8526
Card                         8179
Weather                      7246
Dating                       6524
Libraries & Demo             5198
Casino                       5076
Music                        4202
Parenting                    3810
Comics                       2862
Name: category, dtype: int64


#Some categories of interest like Music and Education are given with different labels:
#there are both 'Music & Audio' and 'Music' labels as well as 'Education' and 'Educational' for education.
#They should be merged together to represent a single category.
#Later, we will subset for the top 8 columns after finishing cleaning.


#Before we further explore, let's deal with the issues we highlighted. Here is the final list:

# Issues List For the Dataset:
## Missing values in several cols: Rating, rating count, Installs, minimum and maximum installs, currency and more
## Drop these columns: App ID, minimum android version, developer ID, website and email, privacy policy link.
## Incorrect data types for release data and size
## Music and education is represented by different labels
## Drop unnecessary categories


# Normalise every column label: lower-case, trimmed, and with spaces replaced
# by underscores (e.g. 'Rating Count' -> 'rating_count').
apps.columns = [
    label.lower().strip().replace(' ', '_') for label in apps.columns
]


apps.columns

Index(['app_name', 'app_id', 'category', 'rating', 'rating_count', 'installs',
       'minimum_installs', 'maximum_installs', 'free', 'price', 'currency',
       'size', 'minimum_android', 'developer_id', 'developer_website',
       'developer_email', 'released', 'last_updated', 'content_rating',
       'privacy_policy', 'ad_supported', 'in_app_purchases', 'editors_choice',
       'scraped_time'],
      dtype='object')


# Columns removed before the analysis (identifiers, contact details, links).
to_drop = [
    'app_id',
    'minimum_android',
    'developer_id',
    'developer_website',
    'developer_email',
    'privacy_policy',
]


# Remove them from `apps` in place.
apps.drop(columns=to_drop, inplace=True)


# Sanity check: none of the columns listed in `to_drop` may still be present.
# The previous test used `apps.columns.all()`, which collapses the whole index
# to a single value, so the membership check could never fail.
assert apps.columns.intersection(to_drop).empty, \
    'some columns listed in to_drop are still present'


apps.columns

Index(['app_name', 'category', 'rating', 'rating_count', 'installs',
       'minimum_installs', 'maximum_installs', 'free', 'price', 'currency',
       'size', 'released', 'last_updated', 'content_rating', 'ad_supported',
       'in_app_purchases', 'editors_choice', 'scraped_time'],
      dtype='object')


len(apps.columns)

18


apps.category

0              Adventure
1                  Tools
2           Productivity
3          Communication
4                  Tools
               ...      
2312939     Role Playing
2312940        Education
2312941        Education
2312942    Music & Audio
2312943           Trivia
Name: category, Length: 2312944, dtype: object


Categories = apps.category.unique()
len(Categories)

48


# Collapse duplicate labels into a single category each:
#   'Music & Audio' -> 'Music'  and  'Educational' -> 'Education'.
# Series.replace with a dict matches whole values only, so the result does not
# depend on the regex default of str.replace and no other label can be
# rewritten by a partial match.
apps['category'] = apps['category'].replace(
    {'Music & Audio': 'Music', 'Educational': 'Education'}
)


apps.category

0              Adventure
1                  Tools
2           Productivity
3          Communication
4                  Tools
               ...      
2312939     Role Playing
2312940        Education
2312941        Education
2312942            Music
2312943           Trivia
Name: category, Length: 2312944, dtype: object


Categories

array(['Adventure', 'Tools', 'Productivity', 'Communication', 'Social',
       'Libraries & Demo', 'Lifestyle', 'Personalization', 'Racing',
       'Maps & Navigation', 'Travel & Local', 'Food & Drink',
       'Books & Reference', 'Medical', 'Puzzle', 'Entertainment',
       'Arcade', 'Auto & Vehicles', 'Photography', 'Health & Fitness',
       'Education', 'Shopping', 'Board', 'Music & Audio', 'Sports',
       'Beauty', 'Business', 'Educational', 'Finance', 'News & Magazines',
       'Casual', 'Art & Design', 'House & Home', 'Card', 'Events',
       'Trivia', 'Weather', 'Strategy', 'Word', 'Video Players & Editors',
       'Action', 'Simulation', 'Music', 'Dating', 'Role Playing',
       'Casino', 'Comics', 'Parenting'], dtype=object)


apps_by_categories = apps.category.value_counts()
apps_by_categories

Education                  262398
Music                      159108
Tools                      143988
Business                   143771
Entertainment              138276
Lifestyle                  118331
Books & Reference          116728
Personalization             89210
Health & Fitness            83510
Productivity                79698
Shopping                    75256
Food & Drink                73927
Travel & Local              67288
Finance                     65466
Arcade                      53792
Puzzle                      51168
Casual                      50813
Communication               48167
Sports                      47483
Social                      44734
News & Magazines            42807
Photography                 35552
Medical                     32065
Action                      27555
Maps & Navigation           26722
Simulation                  23282
Adventure                   23203
Art & Design                18539
Auto & Vehicles             18280
House & Home                14369
Video Players & Editors     14015
Events                      12841
Trivia                      11795
Beauty                      11772
Board                       10588
Racing                      10362
Role Playing                10034
Word                         8630
Strategy                     8526
Card                         8179
Weather                      7246
Dating                       6524
Libraries & Demo             5198
Casino                       5076
Parenting                    3810
Comics                       2862
Name: category, dtype: int64


# Names of the eight categories with the most apps, kept as a Series (named
# 'index', numbered 0-7) for the later `isin` filter.
# value_counts() sorts in descending order, so head(8) is the top eight.
# The labels are read straight from the index: the former
# reset_index() / .loc[:, 'index'] route raises KeyError on pandas >= 2.0,
# where the reset column is no longer called 'index'.
top_8_list = pd.Series(apps_by_categories.head(8).index, name='index')


top_8_list

0            Education
1                Music
2                Tools
3             Business
4        Entertainment
5            Lifestyle
6    Books & Reference
7      Personalization
Name: index, dtype: object


# Keep only the rows whose category is one of the top eight and renumber the
# index from 0.
# NOTE(review): `top` is built from `apps` as it is at this point; the
# `released` and `size` conversions further down are assigned to `apps` only,
# so they must run before this line if `top` should carry the converted columns.
top = apps[apps['category'].isin(top_8_list)].reset_index(drop=True)
top


top.shape

(1171810, 18)


top.category.value_counts()

Education            262398
Music                159108
Tools                143988
Business             143771
Entertainment        138276
Lifestyle            118331
Books & Reference    116728
Personalization       89210
Name: category, dtype: int64


# Parse release dates such as 'Feb 26, 2020' into datetimes.
# Specifying the datetime format significantly reduces conversion time; values
# that do not match the format become NaT (errors='coerce').
# `infer_datetime_format` is omitted: it has no effect once `format` is given
# and is deprecated in pandas 2.0.
apps['released'] = pd.to_datetime(apps['released'], format='%b %d, %Y',
                                  errors='coerce')


apps['released'].dtype

dtype('<M8[ns]')


# Strip all letters (the unit suffix) and convert the remainder to a number;
# anything that is still not numeric becomes NaN (errors='coerce').
# regex=True is passed explicitly: the default of str.replace is False from
# pandas 2.0 on, where the pattern would be matched literally, nothing would
# be stripped and every value would be coerced to NaN.
# NOTE(review): suffixes are removed without rescaling, so this assumes every
# size uses the same unit (megabytes) - confirm against the raw data.
apps['size'] = pd.to_numeric(
    apps['size'].str.replace(r'[a-zA-Z]+', '', regex=True),
    errors='coerce',
)

C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/1235571128.py:2: FutureWarning: The default value of regex will change from True to False in a future version.
  apps['size'] = pd.to_numeric(apps['size'].str.replace(r'[a-zA-Z]+', ''),


apps['size'].dtype == 'float64'

True


# Category
# Rating
# Installs
# App Name
# Price


# Split the per-category app counts at 30,000 apps: categories at or above the
# threshold are "popular", the rest "unpopular".
Popular_categories = apps_by_categories.loc[apps_by_categories.ge(30000)]
Unpopular_categories = apps_by_categories.loc[apps_by_categories.lt(30000)]
len(Popular_categories)
len(Unpopular_categories)

23


# Distribution of the number of apps per category: a density-normalised
# histogram with a KDE overlay.  `distplot` is deprecated; `histplot` is the
# axes-level replacement named in the deprecation warning.
sns.histplot(apps_by_categories, kde=True, stat='density')

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='Category', ylabel='Density'>


# Histogram of every app rating, split into 20 bins.
fig, ax = plt.subplots()
ax.hist(apps['rating'], bins=20)
ax.set_title('A Histogram of App ratings')
ax.set_xlabel('Rating out of 5.0')
ax.set_ylabel('Count')
plt.show()


# From this histogram we can see that we have a big number of apps which have no rating. We should eliminate them


fig, ax = plt.subplots()

# Only strictly positive ratings are kept; ratings of 0 are left out.
over_0 = apps.loc[apps['rating'] > 0, 'rating']

# Same histogram as before, on the filtered ratings and with 15 bins.
ax.hist(over_0, bins=15)
ax.set_title('A Histogram of App ratings')
ax.set_xlabel('Rating out of 5.0')
ax.set_ylabel('Count')

plt.show()


#Histogram shows that majority of the apps are rated between ~3.8 and 4.8. Also surprising to see so many 5-star ratings.


Categories = apps.category.unique()
len(Categories)

46


#Remember that our top 8 categories are stored in top_8_list, and the new DataFrame containing only these categories is called top


fig, ax = plt.subplots()

# Share of the rows in `top` that falls into each category, as horizontal bars.
# The axes are passed explicitly so the bars land on `ax` no matter which axes
# happens to be current.
top['category'].value_counts(normalize=True).plot.barh(ax=ax)

# Label
ax.set(title='Proportion of 8 Categories',
       xlabel='Proportion', ylabel='')

# Save before showing: when run as a script, a blocking plt.show() closes the
# figure on return, so the file is written first.
fig.savefig('Proportion of 8 Categories')

plt.show()


#Looks like educational apps make up more than one fifth of the data.

#It would be ideal if we had the install_count given as integers.


def string_to_numeric(data: str) -> int:
    """Convert an install-count label such as ``'5,000+'`` to an integer.

    Thousands separators are removed and a trailing ``+`` is stripped only
    when it is present, so already-cleaned values such as ``'5000'`` keep all
    of their digits (previously the last character was always dropped).

    Args:
        data: Install count as text, optionally with commas and a ``+`` suffix.

    Returns:
        The install count as an ``int``.

    Raises:
        ValueError: If no valid integer remains after cleaning.
    """
    cleaned = data.replace(',', '').strip().rstrip('+')
    return int(cleaned)


top.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171810 entries, 0 to 1171809
Data columns (total 18 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   app_name          1171810 non-null  object        
 1   category          1171810 non-null  object        
 2   rating            1164547 non-null  float64       
 3   rating_count      1164547 non-null  float64       
 4   installs          1171802 non-null  object        
 5   minimum_installs  1171802 non-null  float64       
 6   maximum_installs  1171810 non-null  int64         
 7   free              1171810 non-null  bool          
 8   price             1171810 non-null  float64       
 9   currency          1171792 non-null  object        
 10  size              1142535 non-null  float64       
 11  released          1144512 non-null  datetime64[ns]
 12  last_updated      1171810 non-null  object        
 13  content_rating    1171810 non-null  object        
 14  ad_supported      1171810 non-null  bool          
 15  in_app_purchases  1171810 non-null  bool          
 16  editors_choice    1171810 non-null  bool          
 17  scraped_time      1171810 non-null  object        
dtypes: bool(4), datetime64[ns](1), float64(5), int64(1), object(7)
memory usage: 129.6+ MB


# Remove the '+' suffix and the thousands separators from the install labels
# (e.g. '5,000+' -> '5000').  Both characters are removed literally
# (regex=False), which avoids the FutureWarning about the regex default.
# The values stay strings and missing entries stay NaN: the former
# `top['installs'] = top['installs'].dropna()` was a no-op, because assigning
# the shorter result back re-aligns it on the index and restores the NaN rows.
top['installs'] = (
    top['installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
top['installs']

C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/2440583364.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  top['installs'] = (top.installs.str.replace('+', ''))

0           5000
1            100
2            500
3          50000
4             50
           ...  
1171805        5
1171806     1000
1171807      100
1171808      100
1171809     1000
Name: installs, Length: 1171810, dtype: object


# top['installs']=top['installs'].apply(string_to_numeric)


fig, ax = plt.subplots()

# Relative frequency of each install label, drawn as horizontal bars.
top['installs'].value_counts(normalize=True).plot(kind='barh')

ax.set_title('Proportion of install categories')
ax.set_xlabel('Proportion')
ax.set_ylabel('')

plt.show()


#the plot shows that the vast majority of installs are between 10 and 10k installs.
#Maybe, we could get a better insight if we plotted rating_count. 
#The number of ratings is given as exact figures and logically, they are positively related to install count.
#Before plotting, let's get the 5-number summary of rating_count (the number of ratings):


# Print the summary statistics of rating_count.  The option context applies
# the '{:f}' float formatting inside this block only, so the figures are shown
# in plain decimal notation.
with pd.option_context('float_format', '{:f}'.format):
    print(top.rating_count.describe())

count    1164547.000000
mean        1210.463652
std        62804.289848
min            0.000000
25%            0.000000
50%            6.000000
75%           39.000000
max     35128398.000000
Name: rating_count, dtype: float64


# from this data we can see that we need to focus to the data where rating_count <39


fig, ax = plt.subplots()

# Keep the apps strictly below the 75th percentile of rating_count.  The
# cut-off is computed from the data instead of being hard-coded, so it stays
# correct if the rows in `top` change.
percentile_75 = top[top['rating_count'] < top['rating_count'].quantile(0.75)]

ax.hist(percentile_75['rating_count'], bins=15)

ax.set(title='Histogram of Rating Count',
       xlabel='Ratings', ylabel='Count')

plt.show()


#This histogram tells us that about a half of the apps have no more than 5 ratings.
#This shows how competitive the mobile market is.
#Only a small proportion of apps can go as popular as the ones with thousands of ratings.


# Apps with strictly more than one million ratings, and the size of that subset.
over_mln = top.loc[top['rating_count'].gt(1e6)]
over_mln.shape

(169, 18)


#Out of the initial 1 million apps, only 169 have over 1 million ratings


fig, ax = plt.subplots()

# Number of apps with over one million ratings in each category, drawn on `ax`.
over_mln['category'].value_counts().plot.barh(ax=ax)

# The bars count apps per category: the vertical axis lists the categories and
# the horizontal axis is a number of apps, not a rating count.
ax.set(title='Comparison of Rating Count of Most Popular Apps by Category',
       xlabel='Number of apps', ylabel='Category')

# Save before showing: when run as a script, a blocking plt.show() closes the
# figure on return, so the file is written first.
fig.savefig('Comparison of Rating Count of Most Popular Apps by Category')
plt.show()


#Not surprisingly, more than 50 of the apps belong to Tools which probably include most popular everyday apps.


# Boolean mask that is True for every app whose price is not zero.
is_paid = top['price'].ne(0)

# Summary statistics of the paid apps' prices, printed in plain decimals.
with pd.option_context('float_format', '{:f}'.format):
    print(top.loc[is_paid, 'price'].describe())

count   26116.000000
mean        4.882284
std        16.846465
min         0.194824
25%         0.990000
50%         1.990000
75%         3.990000
max       399.990000
Name: price, dtype: float64


#From this data we can see that 75% of apps are paid around 4$. We will focus on apps paid less than 10$


fig, ax = plt.subplots()

# Paid apps priced strictly between 0 and 10.
less_10 = top[top['price'].gt(0) & top['price'].lt(10)]

# Histogram of those prices in 15 bins.
ax.hist(less_10['price'], bins=15)
ax.set_title('PMF of Price of Paid Apps')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Count')

plt.show()


# we can see that the most paid apps are paid around 1$


# Which category has the most expensive apps?


fig, ax = plt.subplots()

# Price distribution of the paid apps per category, drawn on `ax` explicitly.
sns.boxplot(x='category', y='price', data=top[is_paid], ax=ax)

# Paid prices range from well under 1 to several hundred, hence the log scale.
ax.set_yscale('log')

ax.set(title='Comparison of Price Between Categories',
       xlabel='', ylabel='Price ($)')

# Rotate xtick labels
ax.tick_params(axis='x', labelrotation=60)

# Put the ticks at the listed prices before labelling them.  Setting only the
# labels attaches the texts to whatever positions the log locator chose, which
# mislabels the axis (the "FixedFormatter" UserWarning).
y_ticks = [0.3, 0.5, 1, 3, 5, 10, 30, 100, 300]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticks)

plt.show()

C:\Users\Monika\AppData\Local\Temp/ipykernel_9020/3100383024.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(y_ticks)


#  Looking at the median price for each category, Business seems to be the winner, closely followed by Books & Reference.
# Now, let's see if more ratings mean higher ratings. 
# Again, we will only look at apps with ratings fewer than 100k and exclude the ones with no ratings.

0                       Ampere Battery Info
1                                   GROW.me
2                     The Everyday Calendar
3          Neon 3d Iron Tech Keyboard Theme
4                   All in one shopping app
                         ...               
1171805    STMIK Mercusuar - Aditya Rachman
1171806         Lero TOEFL Recorder + Timer
1171807                          ORU Online
1171808                      Data Structure
1171809                         Devi Suktam
Name: app_name, Length: 1171810, dtype: object


fig, ax = plt.subplots()

# Keep apps with at least one rating and fewer than 100k ratings.
majority_rated = top[(top['rating_count'] > 0) & (top['rating_count'] < 1e5)]

# Add Gaussian jitter to both coordinates so overlapping points spread out.
# NOTE(review): the jitter is unseeded, so the picture differs between runs,
# and jittered counts that fall to zero or below cannot be drawn on a log axis.
rating_jittered = majority_rated['rating'] + np.random.normal(0, 0.12, len(majority_rated))
count_jittered = majority_rated['rating_count'] + np.random.normal(0, 1, len(majority_rated))

ax.plot(count_jittered, rating_jittered,
        marker='o', linestyle='none', markersize=1, alpha=0.05)

# Use log scale
ax.set_xscale('log')

# Put the ticks at the listed counts before labelling them.  Setting only the
# labels attaches the texts to whatever positions the log locator chose, which
# mislabels the axis (the "FixedFormatter" UserWarning).
x_tick_labels = [1, 10, 50, 100, 200, 500, 1000, 5000, 10000, 50000, 100000]
ax.set_xticks(x_tick_labels)
ax.set_xticklabels(x_tick_labels)

# Label
ax.set(title='Rating Count vs. Rating',
       xlabel='Rating Count',
       ylabel='Rating')

plt.show()

C:\Users\Monika\AppData\Local\Temp/ipykernel_7160/2713553702.py:18: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(x_tick_labels)


# Correlation heatmap of the numeric and boolean columns of `top`.
# The columns are selected explicitly because DataFrame.corr() no longer drops
# non-numeric columns by default in pandas 2.0 and fails on the text columns.
sns.heatmap(top.select_dtypes(include=['number', 'bool']).corr(), annot=True,
            linewidths=3, center=0, cmap='rainbow');

# Write the current figure (the heatmap) to disk.
results_path = 'results.png'
plt.savefig(results_path)


#We could confirm our earlier notion that there is a non-linear positive relationship
#between rating and rating count with a coefficient of  r=0.019 .


# Export the `top` frame for Tableau, once as JSON and once as CSV.  Both
# paths are relative, so the files land in the current working directory.
top.to_json('top_file_tableau.json')


top.to_csv('top_file_tableau.csv')

	App Name	App Id	Category	Rating	Rating Count	Installs	Minimum Installs	Maximum Installs	Free	...	Developer Website	Developer Email	Released	Last Updated	Content Rating	Privacy Policy	Ad Supported	In App Purchases	Editors Choice	Scraped Time
0	Gakondo	com.ishakwe.gakondo	Adventure	0.0	0.0	10+	10.0	15	True	...	https://beniyizibyose.tk/#/	jean21101999@gmail.com	Feb 26, 2020	Feb 26, 2020	Everyone	https://beniyizibyose.tk/projects/	False	False	False	2021-06-15 20:19:35
1	Ampere Battery Info	com.webserveis.batteryinfo	Tools	4.4	64.0	5,000+	5000.0	7662	True	...	https://webserveis.netlify.app/	webserveis@gmail.com	May 21, 2020	May 06, 2021	Everyone	https://dev4phones.wordpress.com/licencia-de-uso/	True	False	False	2021-06-15 20:19:35
2	Vibook	com.doantiepvien.crm	Productivity	0.0	0.0	50+	50.0	58	True	...	NaN	vnacrewit@gmail.com	Aug 9, 2019	Aug 19, 2019	Everyone	https://www.vietnamairlines.com/vn/en/terms-an...	False	False	False	2021-06-15 20:19:35
3	Smart City Trichy Public Service Vehicles 17UC...	cst.stJoseph.ug17ucs548	Communication	5.0	5.0	10+	10.0	19	True	...	http://www.climatesmarttech.com/	climatesmarttech2@gmail.com	Sep 10, 2018	Oct 13, 2018	Everyone	NaN	True	False	False	2021-06-15 20:19:35
4	GROW.me	com.horodyski.grower	Tools	0.0	0.0	100+	100.0	478	True	...	http://www.horodyski.com.pl	rmilekhorodyski@gmail.com	Feb 21, 2020	Nov 12, 2018	Everyone	http://www.horodyski.com.pl	False	False	False	2021-06-15 20:19:35

	rating	rating_count	minimum_installs	maximum_installs	price
count	2.290061e+06	2.290061e+06	2.312837e+06	2.312944e+06	2.312944e+06
mean	2.203152e+00	2.864839e+03	1.834452e+05	3.202017e+05	1.034992e-01
std	2.106223e+00	2.121626e+05	1.513144e+07	2.355495e+07	2.633127e+00
min	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00
25%	0.000000e+00	0.000000e+00	5.000000e+01	8.400000e+01	0.000000e+00
50%	2.900000e+00	6.000000e+00	5.000000e+02	6.950000e+02	0.000000e+00
75%	4.300000e+00	4.200000e+01	5.000000e+03	7.354000e+03	0.000000e+00
max	5.000000e+00	1.385576e+08	1.000000e+10	1.205763e+10	4.000000e+02

	rating	rating_count	minimum_installs	maximum_installs	price
count	2290061.00000	2290061.00000	2312837.00000	2312944.00000	2312944.00000
mean	2.20315	2864.83888	183445.21396	320201.71314	0.10350
std	2.10622	212162.57104	15131439.05973	23554954.88655	2.63313
min	0.00000	0.00000	0.00000	0.00000	0.00000
25%	0.00000	0.00000	50.00000	84.00000	0.00000
50%	2.90000	6.00000	500.00000	695.00000	0.00000
75%	4.30000	42.00000	5000.00000	7354.00000	0.00000
max	5.00000	138557570.00000	10000000000.00000	12057627016.00000	400.00000

Import of the data / Reading file¶

Hypotheses¶

Data clearing and exploring¶

Basic exploring¶

Data cleaning¶

Convert all columns¶

Drop unnecessary columns¶

Collapse multiple categories into one¶

Convert released to datetime¶

Convert size to float¶

Data analysis¶

Save data for Tableau¶