import numpy as np
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
try:
mydb = mysql.connector.connect(
host = 'localhost',
port = '3306',
user = 'root',
password = 'root'
)
songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)
except mysql.connector.Error as err:
# Catch the error and load data from CSV file instead
print("Error: {}".format(err))
# Load data from CSV file into a Pandas dataframe
songs = pd.read_csv('song_data.csv')
C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\1171909835.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)
songs
year | country | style | language | final_place | final_total_points | |
---|---|---|---|---|---|---|
0 | 2022 | Albania | Pop | Albanian, English | NaN | NaN |
1 | 2022 | Latvia | Pop | English | NaN | NaN |
2 | 2022 | Lithuania | Ballad | Lithuanian | 14.0 | 128.0 |
3 | 2022 | Switzerland | Ballad | English | 17.0 | 78.0 |
4 | 2022 | Slovenia | Pop | Slovene | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
523 | 2009 | France | Ballad | French | 8.0 | 218.0 |
524 | 2009 | Russia | Ballad | Russian, Ukrainian | 11.0 | 185.0 |
525 | 2009 | Germany | Pop | English | 20.0 | 91.0 |
526 | 2009 | United Kingdom | Ballad | English | 5.0 | 328.0 |
527 | 2009 | Spain | Dance | Spanish, English | 24.0 | 47.0 |
528 rows × 6 columns
songs = songs.dropna(subset=['final_place'])
songs = songs.reset_index(drop=True)
songs = songs.replace("English6", "English")
songs = songs.replace("Greek1", "Greek")
songs = songs.replace("Italian[f]", "Italian")
songs
year | country | style | language | final_place | final_total_points | Mean Value | |
---|---|---|---|---|---|---|---|
0 | 2022 | Lithuania | Ballad | Lithuanian | 14.0 | 128.0 | NaN |
1 | 2022 | Switzerland | Ballad | English | 17.0 | 78.0 | NaN |
2 | 2022 | Ukraine | Traditional | Ukrainian | 1.0 | 631.0 | NaN |
3 | 2022 | Netherlands | Ballad | Dutch | 11.0 | 171.0 | NaN |
4 | 2022 | Moldova | Traditional | Romanian, English | 7.0 | 253.0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... |
327 | 2009 | France | Ballad | French | 8.0 | 218.0 | NaN |
328 | 2009 | Russia | Ballad | Russian, Ukrainian | 11.0 | 185.0 | NaN |
329 | 2009 | Germany | Pop | English | 20.0 | 91.0 | NaN |
330 | 2009 | United Kingdom | Ballad | English | 5.0 | 328.0 | NaN |
331 | 2009 | Spain | Dance | Spanish, English | 24.0 | 47.0 | NaN |
332 rows × 7 columns
winners = songs.loc[songs['final_place'] == 1]
winners
year | country | style | language | final_place | final_total_points | |
---|---|---|---|---|---|---|
2 | 2022 | Ukraine | Traditional | Ukrainian | 1.0 | 631.0 |
48 | 2021 | Italy | Rock | Italian | 1.0 | 524.0 |
66 | 2019 | Netherlands | Ballad | English | 1.0 | 498.0 |
78 | 2018 | Israel | Pop | English | 1.0 | 529.0 |
105 | 2017 | Portugal | Ballad | Portuguese | 1.0 | 758.0 |
144 | 2016 | Ukraine | Pop | English, Crimean Tatar | 1.0 | 534.0 |
169 | 2015 | Sweden | Pop | English | 1.0 | 625.0 |
192 | 2014 | Austria | Ballad | English | 1.0 | 520.0 |
206 | 2013 | Denmark | Pop | English | 1.0 | 0.0 |
245 | 2012 | Sweden | Dance | English | 1.0 | 639.0 |
265 | 2011 | Azerbaijan | Ballad | English | 1.0 | 405.0 |
306 | 2010 | Germany | Pop | English | 1.0 | 430.0 |
318 | 2009 | Norway | Traditional | English | 1.0 | 690.0 |
styleCounts = songs['style'].value_counts()
styleCounts
style Pop 150 Ballad 91 Dance 38 Traditional 26 Rock 24 Opera 3 Name: count, dtype: int64
groupedSongs = songs.groupby(['year', 'style']).size().reset_index(name='count')
groupedSongs
year | style | count | |
---|---|---|---|
0 | 2009 | Ballad | 9 |
1 | 2009 | Dance | 5 |
2 | 2009 | Pop | 5 |
3 | 2009 | Rock | 1 |
4 | 2009 | Traditional | 5 |
... | ... | ... | ... |
57 | 2022 | Ballad | 12 |
58 | 2022 | Dance | 1 |
59 | 2022 | Pop | 6 |
60 | 2022 | Rock | 1 |
61 | 2022 | Traditional | 5 |
62 rows × 3 columns
# create a scatter plot
plt.scatter(x=groupedSongs['year'], y=groupedSongs['style'], s=groupedSongs['count']*20)
# set axis labels and title
plt.xlabel('Year')
plt.ylabel('Style')
plt.title('Occurrence of Style by Year')
# show the plot
plt.show()
groupedWinners = winners.groupby(['style']).size().reset_index(name='count')
groupedWinners
style | count | |
---|---|---|
0 | Ballad | 4 |
1 | Dance | 1 |
2 | Pop | 5 |
3 | Rock | 1 |
4 | Traditional | 2 |
Pop žanras yra ko ne pastoviai populiariausias žanras kiekvienais metais pagal dainų pasiekusiu finalą kiekį. Taip pat pop dainos yra laimėjusios daugiausiai eurovizijų.
# group by year and style and compute mean
mean_by_year_style = songs.groupby(['year', 'style'])['final_total_points'].mean()
# create new DataFrame with mean values
meanSongs = pd.DataFrame(mean_by_year_style).reset_index()
meanSongs.columns = ['year', 'style', 'mean_by_year_style']
meanSongs
year | style | mean_by_year_style | |
---|---|---|---|
0 | 2009 | Ballad | 213.111111 |
1 | 2009 | Dance | 162.000000 |
2 | 2009 | Pop | 151.600000 |
3 | 2009 | Rock | 160.000000 |
4 | 2009 | Traditional | 245.200000 |
... | ... | ... | ... |
57 | 2022 | Ballad | 162.833333 |
58 | 2022 | Dance | 38.000000 |
59 | 2022 | Pop | 229.500000 |
60 | 2022 | Rock | 38.000000 |
61 | 2022 | Traditional | 246.600000 |
62 rows × 3 columns
# plot mean values by year and style
fig, ax = plt.subplots(figsize=(8, 6))
for style, data in meanSongs.groupby('style'):
ax.plot(data['year'], data['mean_by_year_style'], label=style)
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Mean Value')
ax.set_title('Mean Value by Year and Style')
plt.show()
Dėl didėlio pop dainų kiekio vidurkis yra mažesnis negu kitų dainų vidurkiai.
languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongs.to_string())
year language 2017 Portuguese 758.000000 2022 Ukrainian 631.000000 2016 English, Crimean Tatar 534.000000 2015 Italian 527.000000 2021 Italian 524.000000 2019 Italian 472.000000 2021 French 465.500000 2022 Spanish, English 459.000000 2012 Udmurt, English 426.000000 Serbian 384.000000 2021 Ukrainian 364.000000 2011 Italian, English 350.000000 2017 Italian 334.000000 2022 Serbian, Latin 312.000000 2016 English, Bulgarian 307.000000 2018 Italian 276.500000 2022 Italian 268.000000 2012 Albanian 263.000000 2010 Greek 262.000000 2011 English, Greek 260.000000 2016 French, English 257.000000 2022 Romanian, English 253.000000 2009 Estonian 253.000000 English 239.615385 2019 Icelandic 232.000000 2012 Estonian 230.000000 2018 English 218.117647 2009 French 218.000000 Bosnian 214.000000 2012 English, Italian 213.000000 2022 English, Portuguese 207.000000 2021 Russian, English 204.000000 2011 Serbian 200.000000 2017 Hungarian 200.000000 2012 Spanish 199.000000 2019 English 193.833333 2011 English 187.388889 2014 Polish, English 185.000000 2010 French 185.000000 2009 Russian, Ukrainian 185.000000 2018 Albanian 184.000000 2009 English, Armenian 182.000000 2010 English 179.052632 2018 French 173.000000 2022 Dutch 171.000000 2012 Spanish, English 170.000000 2017 English 169.947368 2016 English 166.619048 2011 Corsican 166.000000 2012 English 164.125000 2011 English, Russian 163.000000 2014 English 161.250000 2010 Hebrew 161.000000 2015 English 160.190476 2009 Romanian, English 159.000000 2016 French 151.000000 2022 English 150.642857 2010 Spanish 149.000000 Serbian 147.000000 2017 French, English 135.000000 2022 Lithuanian 128.000000 2017 English, Italian 128.000000 2021 English 126.466667 2016 Italian, English 124.000000 2011 English, Hungarian 124.000000 2009 English, Hebrew, Arabic 122.000000 2019 English, French 120.000000 2009 Croatian 113.000000 2018 Serbian 113.000000 2014 English, Spanish 112.000000 2011 Spanish 111.000000 2009 Portuguese 109.000000 2019 Slovene 105.000000 French, English 105.000000 2021 Serbian 102.000000 2010 Portuguese 93.000000 2018 Hungarian 93.000000 2019 Albanian 90.000000 2015 Romanian, English 90.000000 2019 Serbian 89.000000 2009 French, English 86.000000 2012 French, English 85.000000 2017 Belarusian 83.000000 2015 Montenegrin 78.000000 2014 Montenegrin 75.000000 2009 English, Russian 69.000000 2022 English, Spanish 65.000000 2018 Slovene 64.000000 Spanish 61.000000 2014 Italian 59.000000 2021 Albanian 57.000000 2019 Spanish 54.000000 2009 Spanish, English 47.000000 2018 Portuguese 39.000000 2014 English, Slovene 36.000000 2015 Spanish 32.000000 French 27.000000 2021 English, Sranan Tongo 23.000000 2022 Icelandic 20.000000 Breton 17.000000 2021 Spanish 6.000000 2014 French 6.000000 2017 Spanish, English 5.000000 2013 Romanian 0.000000 Italian 0.000000 Icelandic 0.000000 Hungarian 0.000000 Spanish 0.000000 French 0.000000 Estonian 0.000000 English 0.000000 2012 Macedonian 0.000000 Bosnian 0.000000 2013 Greek 0.000000
languageSongsbyStyle = songs.groupby(['language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongsbyStyle.to_string())
language English, Crimean Tatar 534.000000 Ukrainian 497.500000 Udmurt, English 426.000000 Serbian, Latin 312.000000 English, Bulgarian 307.000000 Italian 304.111111 English, Greek 260.000000 Portuguese 249.750000 Italian, English 237.000000 English, Portuguese 207.000000 Russian, English 204.000000 French 187.888889 Russian, Ukrainian 185.000000 Polish, English 185.000000 English, Armenian 182.000000 Serbian 172.500000 Dutch 171.000000 English, Italian 170.500000 Spanish, English 170.250000 Romanian, English 167.333333 Corsican 166.000000 English 161.729258 Hebrew 161.000000 Estonian 161.000000 Albanian 148.500000 French, English 133.600000 Greek 131.000000 Lithuanian 128.000000 English, Hungarian 124.000000 English, Hebrew, Arabic 122.000000 English, French 120.000000 English, Russian 116.000000 Croatian 113.000000 Bosnian 107.000000 Hungarian 97.666667 English, Spanish 88.500000 Slovene 84.500000 Icelandic 84.000000 Belarusian 83.000000 Montenegrin 76.500000 Spanish 76.500000 English, Slovene 36.000000 English, Sranan Tongo 23.000000 Breton 17.000000 Macedonian 0.000000 Romanian 0.000000
languageSongsbyStyle.describe()
count 46.000000 mean 167.159332 std 113.433223 min 0.000000 25% 100.000000 50% 161.000000 75% 187.166667 max 534.000000 Name: final_total_points, dtype: float64
songs.value_counts()
Series([], Name: count, dtype: int64)
languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().unstack()
languageSongs.plot(kind='bar', stacked=True)
plt.xlabel('Year')
plt.ylabel('Mean final total points')
plt.title('Mean total points by year and language')
plt.legend(title='Language', bbox_to_anchor=(1.05, 1))
plt.show()
Gryna anglų kalbą yra tarp šiek tiek virš 50 procentilio. Tačiau jeigu sujungus ukrainiečių kalbas, junginys būtų pats auksčiaučias tašku vidurkis tarp duomenų rinkinio.
new_data = pd.DataFrame({'year': [2023], 'style': ['Rock']})
# Prepare the data
X = songs[['year', 'style']] # Features
y = songs['final_place'] # Target
# Create a Linear Regression model object
model = LinearRegression()
# Encode the categorical variable
le = LabelEncoder()
X['style'] = le.fit_transform(X['style'])
# Train the model
model.fit(X, y)
# Print the coefficients
print(model.coef_)
# Make predictions on new data
new_data['style'] = le.transform(new_data['style'])
prediction = model.predict(new_data)
print(prediction)
[0.00253852 0.14683223] [13.67136886]
C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\3447968430.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X['style'] = le.fit_transform(X['style'])