Social media is a digital technology that allows the sharing of ideas and information, including text and visuals, through virtual networks and communities. What started in the early 2000s as a way for people to interact with friends and family soon grew into a worldwide addiction that can increase the risk of depression, anxiety, loneliness, and even self-harm.
The main goal of this work is to analyze the results of a questionnaire about social media use and to try several machine learning models in order to confirm or reject the hypotheses.
Importing the necessary modules.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import project_module as pm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
import pymysql
from sqlalchemy import create_engine
Connecting to MySQL and loading the survey data from the database into a DataFrame called results.
db_connection_str = "mysql+pymysql://root:***@localhost:3306/project_social_media"
db_connection = create_engine(db_connection_str)
results = pd.read_sql("SELECT * FROM survey_results", con=db_connection)
results
| age | gender | time_spent | platform | interests | location | demographics | profession | income | in_debt | home_owner | car_owner
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 56 | male | 3 | Instagram | Sports | United Kingdom | Urban | Software Engineer | 19774 | True | False | False
1 | 46 | female | 2 | Facebook | Travel | United Kingdom | Urban | Student | 10564 | True | True | True
2 | 32 | male | 8 | Instagram | Sports | Australia | Sub_Urban | Marketer Manager | 13258 | False | False | False
3 | 60 | non-binary | 5 | Instagram | Travel | United Kingdom | Urban | Student | 12500 | False | True | False
4 | 25 | male | 1 | Instagram | Lifestlye | Australia | Urban | Software Engineer | 14566 | False | True | True
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ...
995 | 22 | female | 8 | Instagram | Lifestlye | United Kingdom | Rural | Marketer Manager | 18536 | False | True | False
996 | 40 | non-binary | 6 | YouTube | Travel | United Kingdom | Rural | Software Engineer | 12711 | True | False | False
997 | 27 | non-binary | 5 | YouTube | Travel | United Kingdom | Rural | Student | 17595 | True | False | True
998 | 61 | female | 4 | YouTube | Sports | Australia | Sub_Urban | Marketer Manager | 16273 | True | True | False
999 | 19 | female | 8 | YouTube | Travel | Australia | Rural | Student | 16284 | False | True | False
1000 rows × 12 columns
Correcting a typo in the interests column.
results["interests"] = results["interests"].replace("Lifestlye", "Lifestyle")
results
| age | gender | time_spent | platform | interests | location | demographics | profession | income | in_debt | home_owner | car_owner
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 56 | male | 3 | Instagram | Sports | United Kingdom | Urban | Software Engineer | 19774 | True | False | False
1 | 46 | female | 2 | Facebook | Travel | United Kingdom | Urban | Student | 10564 | True | True | True
2 | 32 | male | 8 | Instagram | Sports | Australia | Sub_Urban | Marketer Manager | 13258 | False | False | False
3 | 60 | non-binary | 5 | Instagram | Travel | United Kingdom | Urban | Student | 12500 | False | True | False
4 | 25 | male | 1 | Instagram | Lifestyle | Australia | Urban | Software Engineer | 14566 | False | True | True
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ...
995 | 22 | female | 8 | Instagram | Lifestyle | United Kingdom | Rural | Marketer Manager | 18536 | False | True | False
996 | 40 | non-binary | 6 | YouTube | Travel | United Kingdom | Rural | Software Engineer | 12711 | True | False | False
997 | 27 | non-binary | 5 | YouTube | Travel | United Kingdom | Rural | Student | 17595 | True | False | True
998 | 61 | female | 4 | YouTube | Sports | Australia | Sub_Urban | Marketer Manager | 16273 | True | True | False
999 | 19 | female | 8 | YouTube | Travel | Australia | Rural | Student | 16284 | False | True | False
1000 rows × 12 columns
Checking if all data types are correct.
results.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   age           1000 non-null   int64
 1   gender        1000 non-null   object
 2   time_spent    1000 non-null   int64
 3   platform      1000 non-null   object
 4   interests     1000 non-null   object
 5   location      1000 non-null   object
 6   demographics  1000 non-null   object
 7   profession    1000 non-null   object
 8   income        1000 non-null   int64
 9   in_debt       1000 non-null   object
 10  home_owner    1000 non-null   object
 11  car_owner     1000 non-null   object
dtypes: int64(3), object(9)
memory usage: 93.9+ KB
All the data types are correct except the last three columns, which should be Boolean. Since they will be converted to numbers in the next step, I am leaving them as type object.
To include as many columns as possible in the correlation calculation, the values True and False in the columns in_debt, home_owner and car_owner are replaced by numbers. The values in the platform column are also encoded: "Facebook" becomes 1, "Instagram" becomes 2 and "YouTube" becomes 3. The replacements are made with my function updating_data, which, given the column to update and the lists of old and new values, repeatedly applies the replace method until all the changes are made.
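updating_data lives in project_module, which is not shown here; below is a minimal sketch of what it might look like, judging by how it is called (the function body is an assumption, not the actual implementation):

```python
import pandas as pd

def updating_data(column, old_values, new_values):
    """Hypothetical sketch of project_module.updating_data: replace each old
    value with its new counterpart, mutating the passed column in place."""
    for old, new in zip(old_values, new_values):
        column.replace(old, new, inplace=True)
```

The calls below then pass each column together with its old and new value lists.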
pm.updating_data(results["in_debt"], ["False", "True"], [0, 1])
pm.updating_data(results["home_owner"], ["False", "True"], [0, 1])
pm.updating_data(results["car_owner"], ["False", "True"], [0, 1])
pm.updating_data(results["platform"], ["Facebook", "Instagram", "YouTube"], [1, 2, 3])
results
age | gender | time_spent | platform | interests | location | demographics | profession | income | in_debt | home_owner | car_owner | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | male | 3 | 2 | Sports | United Kingdom | Urban | Software Engineer | 19774 | 1 | 0 | 0 |
1 | 46 | female | 2 | 1 | Travel | United Kingdom | Urban | Student | 10564 | 1 | 1 | 1 |
2 | 32 | male | 8 | 2 | Sports | Australia | Sub_Urban | Marketer Manager | 13258 | 0 | 0 | 0 |
3 | 60 | non-binary | 5 | 2 | Travel | United Kingdom | Urban | Student | 12500 | 0 | 1 | 0 |
4 | 25 | male | 1 | 2 | Lifestyle | Australia | Urban | Software Engineer | 14566 | 0 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 22 | female | 8 | 2 | Lifestyle | United Kingdom | Rural | Marketer Manager | 18536 | 0 | 1 | 0 |
996 | 40 | non-binary | 6 | 3 | Travel | United Kingdom | Rural | Software Engineer | 12711 | 1 | 0 | 0 |
997 | 27 | non-binary | 5 | 3 | Travel | United Kingdom | Rural | Student | 17595 | 1 | 0 | 1 |
998 | 61 | female | 4 | 3 | Sports | Australia | Sub_Urban | Marketer Manager | 16273 | 1 | 1 | 0 |
999 | 19 | female | 8 | 3 | Travel | Australia | Rural | Student | 16284 | 0 | 1 | 0 |
1000 rows × 12 columns
results.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   age           1000 non-null   int64
 1   gender        1000 non-null   object
 2   time_spent    1000 non-null   int64
 3   platform      1000 non-null   int64
 4   interests     1000 non-null   object
 5   location      1000 non-null   object
 6   demographics  1000 non-null   object
 7   profession    1000 non-null   object
 8   income        1000 non-null   int64
 9   in_debt       1000 non-null   int64
 10  home_owner    1000 non-null   int64
 11  car_owner     1000 non-null   int64
dtypes: int64(7), object(5)
memory usage: 93.9+ KB
Creating a table containing only the int64 columns in order to compute the correlation.
for_corr = results[["age", "time_spent", "platform", "income", "in_debt", "home_owner", "car_owner"]]
for_corr
age | time_spent | platform | income | in_debt | home_owner | car_owner | |
---|---|---|---|---|---|---|---|
0 | 56 | 3 | 2 | 19774 | 1 | 0 | 0 |
1 | 46 | 2 | 1 | 10564 | 1 | 1 | 1 |
2 | 32 | 8 | 2 | 13258 | 0 | 0 | 0 |
3 | 60 | 5 | 2 | 12500 | 0 | 1 | 0 |
4 | 25 | 1 | 2 | 14566 | 0 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... |
995 | 22 | 8 | 2 | 18536 | 0 | 1 | 0 |
996 | 40 | 6 | 3 | 12711 | 1 | 0 | 0 |
997 | 27 | 5 | 3 | 17595 | 1 | 0 | 1 |
998 | 61 | 4 | 3 | 16273 | 1 | 1 | 0 |
999 | 19 | 8 | 3 | 16284 | 0 | 1 | 0 |
1000 rows × 7 columns
Cleaning the data by removing values that fall outside the range [Q1 - 1.5·IQR; Q3 + 1.5·IQR], where IQR = Q3 - Q1. The data is cleaned with my function data_cleaning.
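data_cleaning also comes from project_module; here is a plausible sketch under the IQR rule just described (the implementation is an assumption):

```python
import pandas as pd

def data_cleaning(df):
    """Hypothetical sketch of project_module.data_cleaning: drop every row that
    has a value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in any column."""
    cleaned = df.copy()
    for col in cleaned.columns:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        cleaned = cleaned[cleaned[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
    return cleaned
```

Since the cleaned table below still has 1000 rows, the survey data apparently contains no outliers under this rule.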
for_corr_cleaned = pm.data_cleaning(for_corr)
for_corr_cleaned
age | time_spent | platform | income | in_debt | home_owner | car_owner | |
---|---|---|---|---|---|---|---|
0 | 56 | 3 | 2 | 19774 | 1 | 0 | 0 |
1 | 46 | 2 | 1 | 10564 | 1 | 1 | 1 |
2 | 32 | 8 | 2 | 13258 | 0 | 0 | 0 |
3 | 60 | 5 | 2 | 12500 | 0 | 1 | 0 |
4 | 25 | 1 | 2 | 14566 | 0 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... |
995 | 22 | 8 | 2 | 18536 | 0 | 1 | 0 |
996 | 40 | 6 | 3 | 12711 | 1 | 0 | 0 |
997 | 27 | 5 | 3 | 17595 | 1 | 0 | 1 |
998 | 61 | 4 | 3 | 16273 | 1 | 1 | 0 |
999 | 19 | 8 | 3 | 16284 | 0 | 1 | 0 |
1000 rows × 7 columns
for_corr_cleaned.corr()
age | time_spent | platform | income | in_debt | home_owner | car_owner | |
---|---|---|---|---|---|---|---|
age | 1.000000 | -0.033827 | 0.011086 | -0.087391 | -0.017055 | -0.005321 | 0.006921 |
time_spent | -0.033827 | 1.000000 | -0.029979 | 0.004757 | 0.013079 | 0.029388 | -0.020271 |
platform | 0.011086 | -0.029979 | 1.000000 | -0.007061 | 0.008947 | 0.043415 | 0.036720 |
income | -0.087391 | 0.004757 | -0.007061 | 1.000000 | 0.037860 | 0.006072 | 0.019789 |
in_debt | -0.017055 | 0.013079 | 0.008947 | 0.037860 | 1.000000 | 0.038102 | -0.035641 |
home_owner | -0.005321 | 0.029388 | 0.043415 | 0.006072 | 0.038102 | 1.000000 | -0.051411 |
car_owner | 0.006921 | -0.020271 | 0.036720 | 0.019789 | -0.035641 | -0.051411 | 1.000000 |
sns.heatmap(for_corr_cleaned.corr())
plt.show()
As no correlation coefficient exceeds 0.3 in absolute value, the correlations can be called negligible.
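The "negligible" call can also be checked programmatically by extracting the largest off-diagonal coefficient; a sketch using synthetic data standing in for for_corr_cleaned:

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for for_corr_cleaned: independent columns of random noise.
rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["a", "b", "c"])

corr = demo.corr()
# Mask the diagonal (always 1.0) and take the largest absolute value left over.
off_diag = corr.where(~np.eye(len(corr), dtype=bool))
max_abs = off_diag.abs().max().max()
print(f"strongest pairwise correlation: {max_abs:.3f}")
# By the common rule of thumb, |r| < 0.3 counts as negligible.
```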
Checking data distribution.
fig, ax = plt.subplots(1, 2, figsize=(8, 3), sharey=True)
fig.suptitle("Data distribution")
ax[0].scatter(results["age"], results["time_spent"], c='purple', s=5)
ax[0].set_xlabel("age", loc="center")
ax[0].set_ylabel("time spent", loc="center")
ax[1].scatter(results["income"], results["time_spent"], c='blue', s=5)
ax[1].set_xlabel("income", loc="center")
plt.tight_layout()
plt.show()
Checking whether a KNN regression model is a good fit for predicting time_spent, with age as the independent variable.
X = results[["age"]]
y = results["time_spent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, y_train)
print("Predictions:")
print(model.predict(X_test))
print("R^2 score:")
print(model.score(X_test, y_test))
Predictions:
[1. 9. 8. 7. 4. 8. 2. 4. 4. 4. 1. 1. 4. 4. 5. 3. 1. 4. 5. 9. 4. 7. 2. 8. 6. 4. 2. 5. 5. 8. 2. 5. 2. 4. 9. 6. 6. 2. 2. 5. 8. 4. 7. 8. 4. 1. 5. 4. 2. 2. 2. 3. 4. 4. 6. 5. 3. 2. 2. 8. 7. 4. 8. 5. 9. 5. 9. 3. 4. 4. 4. 9. 8. 3. 2. 3. 9. 5. 2. 6. 2. 6. 4. 4. 6. 8. 5. 6. 5. 5. 4. 3. 2. 1. 4. 8. 2. 5. 2. 6. 4. 9. 6. 5. 8. 2. 4. 9. 8. 6. 4. 2. 8. 6. 4. 4. 4. 4. 9. 2. 3. 8. 5. 1. 2. 4. 5. 7. 7. 5. 8. 9. 4. 8. 1. 5. 1. 4. 3. 8. 5. 3. 8. 4. 2. 2. 4. 3. 3. 8. 1. 1. 9. 5. 2. 8. 2. 9. 5. 4. 4. 9. 2. 4. 2. 3. 3. 9. 3. 4. 7. 1. 8. 5. 3. 9. 9. 5. 4. 4. 5. 4. 5. 3. 5. 3. 4. 6. 3. 4. 4. 8. 2. 8. 3. 3. 9. 9. 9. 3.]
R^2 score:
-0.8958256485921554
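Note that train_test_split shuffles at random, so the score above will vary from run to run; passing random_state (not used in this notebook) would fix the split and make the scores reproducible. A minimal sketch:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(-1, 1)
y = np.arange(100)

# The same random_state always produces the same split,
# so reported scores become reproducible.
split_a = train_test_split(X, y, test_size=0.2, random_state=42)
split_b = train_test_split(X, y, test_size=0.2, random_state=42)
print((split_a[0] == split_b[0]).all())  # True
```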
Plotting the questionnaire data together with the KNN regression predictions.
results.plot.scatter(x='age', y='time_spent', color="red", title="Real data & KNN Regression prediction")
plt.scatter(X_test, model.predict(X_test))
plt.show()
The R^2 score is less than zero, and the predicted values are floats even though all the provided data is integer.
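The float predictions are expected: KNeighborsRegressor averages the target values of the k nearest neighbours, so even integer targets come back as floats. A minimal illustration with made-up points:

```python
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Toy data: two integer targets near x=20, two near x=40.
X = np.array([[20], [21], [40], [41]])
y = np.array([2, 3, 8, 9])

model = KNeighborsRegressor(n_neighbors=2)
model.fit(X, y)
# The prediction is the mean of the two nearest targets: (2 + 3) / 2 = 2.5
print(model.predict([[20.5]]))  # [2.5]
```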
Trying a KNN classifier model on the same columns.
X = results[["age"]]
y = results["time_spent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = KNeighborsClassifier(n_neighbors=100)
model.fit(X_train, y_train)
print("Prediction:")
print(model.predict(X_test))
print("Accuracy:")
print(model.score(X_test, y_test))
results.plot.scatter(x='age', y='time_spent', color="red", title="Real data & KNN Classifier prediction")
plt.scatter(X_test, model.predict(X_test))
plt.show()
Prediction:
[8 8 1 7 4 9 1 3 2 1 3 5 2 2 5 1 8 2 1 3 2 2 2 9 3 2 4 9 2 1 5 3 8 7 5 1 7 6 3 1 2 5 5 8 8 4 3 7 6 8 5 1 5 1 8 3 2 5 5 6 2 6 2 8 9 3 8 3 6 3 7 3 2 6 7 5 6 9 2 3 2 3 9 7 4 8 1 3 7 1 3 2 6 6 7 4 5 2 5 7 7 4 2 1 1 7 1 2 5 2 2 2 2 3 3 4 1 7 3 4 4 4 7 1 5 2 3 2 5 5 6 2 2 2 1 2 3 2 1 3 7 7 8 4 1 9 5 1 3 3 1 2 3 7 9 1 8 3 6 5 2 8 3 1 9 8 1 7 1 1 2 2 2 8 6 2 2 3 7 2 3 3 3 2 7 3 5 3 2 2 3 6 3 1 3 5 1 6 6 6]
Accuracy:
0.105
With the KNN classifier, all predictions are integers and the score is positive. Note that for a classifier, score returns the mean accuracy rather than R^2; at 0.105 it is far too low for the model to be accurate.
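For context, an accuracy of 0.105 is barely above what a majority-class baseline achieves on a 9-class target; a sketch with synthetic stand-ins for the age and time_spent columns:

```python
import numpy as np
from sklearn.dummy import DummyClassifier

# Synthetic stand-ins: ages 18-64 and a time_spent-like target of 1-9 hours.
rng = np.random.default_rng(0)
X = rng.integers(18, 65, size=(1000, 1))
y = rng.integers(1, 10, size=1000)

# Always predict the most frequent class, ignoring the features.
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X, y)
print(baseline.score(X, y))  # roughly 1/9 for near-uniform classes
```

Any classifier worth trusting should clearly beat this baseline.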
X = results[["income"]]
y = results["time_spent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = KNeighborsClassifier(n_neighbors=100)
model.fit(X_train, y_train)
print("Prediction:")
print(model.predict(X_test))
print("Accuracy:")
print(model.score(X_test, y_test))
results.plot.scatter(x='income', y='time_spent', color="red", title="Real data & KNN Classifier prediction")
plt.scatter(X_test, model.predict(X_test))
plt.show()
Prediction:
[5 5 3 3 3 5 5 4 5 5 4 2 4 9 2 3 4 4 2 4 5 5 5 6 5 3 5 4 5 3 5 2 7 9 4 5 2 3 2 2 9 3 5 6 2 3 5 4 9 8 4 5 5 3 2 5 5 5 6 3 5 5 3 5 5 3 3 7 4 3 3 1 3 5 7 5 4 5 4 5 7 7 3 4 4 4 3 2 3 9 1 4 3 5 3 9 4 5 5 3 6 3 5 3 3 2 5 3 6 9 3 5 6 5 2 5 3 2 5 5 5 2 4 6 6 4 5 3 3 9 9 5 4 4 3 4 3 1 1 4 3 5 3 5 2 4 3 3 5 2 5 5 1 5 2 4 2 5 5 7 8 6 3 5 6 2 7 6 7 2 4 3 3 3 7 9 4 3 3 2 5 2 5 3 4 4 3 5 9 3 9 5 6 4 2 9 2 3 5 3]
Accuracy:
0.085
The KNN classifier with income as the independent variable shows an accuracy of 0.085.
To predict platform, the KNN classifier is used again.
X = results[["age"]]
y = results["platform"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = KNeighborsClassifier(n_neighbors=100)
model.fit(X_train, y_train)
print("Prediction:")
print(model.predict(X_test))
print("Accuracy:")
print(model.score(X_test, y_test))
results.plot.scatter(x="age", y="platform", color="red", title="Real data & KNN Classifier prediction")
plt.scatter(X_test, model.predict(X_test))
plt.show()
Prediction:
[2 2 2 3 2 2 2 2 2 3 3 3 2 2 2 3 3 2 2 2 2 2 3 3 3 2 2 2 2 2 3 2 3 3 2 2 2 2 3 1 2 2 3 1 3 3 2 3 2 2 3 2 2 2 1 2 2 3 3 2 2 3 3 2 2 3 2 2 3 2 2 2 3 2 2 2 3 2 3 1 2 2 2 2 2 2 2 3 3 2 3 2 2 3 2 3 2 3 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 3 2 2 1 2 3 2 2 3 2 2 2 2 3 3 2 3 2 3 3 3 2 3 2 2 2 2 3 2 2 2 2 2 3 2 2 3 2 3 3 2 1 2 3 3 2 1 2 2 3 2 1 2 3 2 2 2 2 2 2 2 3 2 3 2 2 3 3 3 2 2 2 2 1 2 2 2 3 2 3 2 2 3 2 2 3 2]
Accuracy:
0.355
X = results[["income"]]
y = results["platform"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = KNeighborsClassifier(n_neighbors=100)
model.fit(X_train, y_train)
print("Prediction:")
print(model.predict(X_test))
print("Accuracy:")
print(model.score(X_test, y_test))
results.plot.scatter(x='income', y='platform', color="red", title="Real data & KNN Classifier prediction")
plt.scatter(X_test, model.predict(X_test))
plt.show()
Prediction:
[3 2 2 2 1 1 1 2 1 2 3 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 2 3 2 2 3 2 2 2 2 2 3 2 2 1 1 1 2 3 1 3 2 3 2 3 2 3 3 2 1 3 2 2 2 2 1 1 2 2 3 2 3 1 2 2 1 2 2 2 2 3 1 2 2 2 2 1 2 1 2 2 3 1 2 3 1 1 2 2 3 3 2 2 2 2 2 2 2 3 2 2 1 2 2 2 2 2 2 1 2 3 2 1 2 2 2 3 2 3 3 2 3 2 3 3 2 2 2 1 2 2 2 2 2 3 3 3 2 2 2 3 2 1 2 2 2 3 2 2 2 2 1 2 1 1 1 2 2 2 1 1 2 2 3 2 3 2 1 3 1 2 3 2 2 2 1 3 3 2 2 3 2 2 1 2 2 2 2 2 1 2 3 2 2 1]
Accuracy:
0.295
With age as the independent variable, the KNN classifier shows a higher accuracy (0.355) than with income (0.295).
Hypotheses 1 and 2 were not confirmed: the questionnaire data did not reveal any significant correlation.
The KNN regression model was not suitable for predicting the time spent on social media per day, as its R^2 score was below zero.
The KNN classifier performed better, but its accuracy was still too low for the predictions to be trusted, so hypotheses 3 and 4 were also rejected.