import numpy as np
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


try:
    mydb = mysql.connector.connect(
        host = 'localhost',
        port = '3306',
        user = 'root',
        password = 'root'
    )
    songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)
except mysql.connector.Error as err:
    # Catch the error and load data from CSV file instead
    print("Error: {}".format(err))

    # Load data from CSV file into a Pandas dataframe
    songs = pd.read_csv('song_data.csv')

C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\1171909835.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)


songs


songs = songs.dropna(subset=['final_place'])
songs = songs.reset_index(drop=True)
songs = songs.replace("English6", "English")
songs = songs.replace("Greek1", "Greek")
songs = songs.replace("Italian[f]", "Italian")
songs


winners = songs.loc[songs['final_place'] == 1]
winners


styleCounts = songs['style'].value_counts()
styleCounts

style
Pop            150
Ballad          91
Dance           38
Traditional     26
Rock            24
Opera            3
Name: count, dtype: int64


groupedSongs = songs.groupby(['year', 'style']).size().reset_index(name='count')
groupedSongs


# create a scatter plot
plt.scatter(x=groupedSongs['year'], y=groupedSongs['style'], s=groupedSongs['count']*20)

# set axis labels and title
plt.xlabel('Year')
plt.ylabel('Style')
plt.title('Occurrence of Style by Year')

# show the plot
plt.show()


groupedWinners = winners.groupby(['style']).size().reset_index(name='count')
groupedWinners


# group by year and style and compute mean
mean_by_year_style = songs.groupby(['year', 'style'])['final_total_points'].mean()

# create new DataFrame with mean values
meanSongs = pd.DataFrame(mean_by_year_style).reset_index()
meanSongs.columns = ['year', 'style', 'mean_by_year_style']

meanSongs


# plot mean values by year and style
fig, ax = plt.subplots(figsize=(8, 6))
for style, data in meanSongs.groupby('style'):
    ax.plot(data['year'], data['mean_by_year_style'], label=style)
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Mean Value')
ax.set_title('Mean Value by Year and Style')
plt.show()


languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongs.to_string())

year  language               
2017  Portuguese                 758.000000
2022  Ukrainian                  631.000000
2016  English, Crimean Tatar     534.000000
2015  Italian                    527.000000
2021  Italian                    524.000000
2019  Italian                    472.000000
2021  French                     465.500000
2022  Spanish, English           459.000000
2012  Udmurt, English            426.000000
      Serbian                    384.000000
2021  Ukrainian                  364.000000
2011  Italian, English           350.000000
2017  Italian                    334.000000
2022  Serbian, Latin             312.000000
2016  English, Bulgarian         307.000000
2018  Italian                    276.500000
2022  Italian                    268.000000
2012  Albanian                   263.000000
2010  Greek                      262.000000
2011  English, Greek             260.000000
2016  French, English            257.000000
2022  Romanian, English          253.000000
2009  Estonian                   253.000000
      English                    239.615385
2019  Icelandic                  232.000000
2012  Estonian                   230.000000
2018  English                    218.117647
2009  French                     218.000000
      Bosnian                    214.000000
2012  English, Italian           213.000000
2022  English, Portuguese        207.000000
2021  Russian, English           204.000000
2011  Serbian                    200.000000
2017  Hungarian                  200.000000
2012  Spanish                    199.000000
2019  English                    193.833333
2011  English                    187.388889
2014  Polish, English            185.000000
2010  French                     185.000000
2009  Russian, Ukrainian         185.000000
2018  Albanian                   184.000000
2009  English, Armenian          182.000000
2010  English                    179.052632
2018  French                     173.000000
2022  Dutch                      171.000000
2012  Spanish, English           170.000000
2017  English                    169.947368
2016  English                    166.619048
2011  Corsican                   166.000000
2012  English                    164.125000
2011  English, Russian           163.000000
2014  English                    161.250000
2010  Hebrew                     161.000000
2015  English                    160.190476
2009  Romanian, English          159.000000
2016  French                     151.000000
2022  English                    150.642857
2010  Spanish                    149.000000
      Serbian                    147.000000
2017  French, English            135.000000
2022  Lithuanian                 128.000000
2017  English, Italian           128.000000
2021  English                    126.466667
2016  Italian, English           124.000000
2011  English, Hungarian         124.000000
2009  English, Hebrew, Arabic    122.000000
2019  English, French            120.000000
2009  Croatian                   113.000000
2018  Serbian                    113.000000
2014  English, Spanish           112.000000
2011  Spanish                    111.000000
2009  Portuguese                 109.000000
2019  Slovene                    105.000000
      French, English            105.000000
2021  Serbian                    102.000000
2010  Portuguese                  93.000000
2018  Hungarian                   93.000000
2019  Albanian                    90.000000
2015  Romanian, English           90.000000
2019  Serbian                     89.000000
2009  French, English             86.000000
2012  French, English             85.000000
2017  Belarusian                  83.000000
2015  Montenegrin                 78.000000
2014  Montenegrin                 75.000000
2009  English, Russian            69.000000
2022  English, Spanish            65.000000
2018  Slovene                     64.000000
      Spanish                     61.000000
2014  Italian                     59.000000
2021  Albanian                    57.000000
2019  Spanish                     54.000000
2009  Spanish, English            47.000000
2018  Portuguese                  39.000000
2014  English, Slovene            36.000000
2015  Spanish                     32.000000
      French                      27.000000
2021  English, Sranan Tongo       23.000000
2022  Icelandic                   20.000000
      Breton                      17.000000
2021  Spanish                      6.000000
2014  French                       6.000000
2017  Spanish, English             5.000000
2013  Romanian                     0.000000
      Italian                      0.000000
      Icelandic                    0.000000
      Hungarian                    0.000000
      Spanish                      0.000000
      French                       0.000000
      Estonian                     0.000000
      English                      0.000000
2012  Macedonian                   0.000000
      Bosnian                      0.000000
2013  Greek                        0.000000


languageSongsbyStyle = songs.groupby(['language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongsbyStyle.to_string())

language
English, Crimean Tatar     534.000000
Ukrainian                  497.500000
Udmurt, English            426.000000
Serbian, Latin             312.000000
English, Bulgarian         307.000000
Italian                    304.111111
English, Greek             260.000000
Portuguese                 249.750000
Italian, English           237.000000
English, Portuguese        207.000000
Russian, English           204.000000
French                     187.888889
Russian, Ukrainian         185.000000
Polish, English            185.000000
English, Armenian          182.000000
Serbian                    172.500000
Dutch                      171.000000
English, Italian           170.500000
Spanish, English           170.250000
Romanian, English          167.333333
Corsican                   166.000000
English                    161.729258
Hebrew                     161.000000
Estonian                   161.000000
Albanian                   148.500000
French, English            133.600000
Greek                      131.000000
Lithuanian                 128.000000
English, Hungarian         124.000000
English, Hebrew, Arabic    122.000000
English, French            120.000000
English, Russian           116.000000
Croatian                   113.000000
Bosnian                    107.000000
Hungarian                   97.666667
English, Spanish            88.500000
Slovene                     84.500000
Icelandic                   84.000000
Belarusian                  83.000000
Montenegrin                 76.500000
Spanish                     76.500000
English, Slovene            36.000000
English, Sranan Tongo       23.000000
Breton                      17.000000
Macedonian                   0.000000
Romanian                     0.000000


languageSongsbyStyle.describe()

count     46.000000
mean     167.159332
std      113.433223
min        0.000000
25%      100.000000
50%      161.000000
75%      187.166667
max      534.000000
Name: final_total_points, dtype: float64


songs.value_counts()

Series([], Name: count, dtype: int64)


languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().unstack()
languageSongs.plot(kind='bar', stacked=True)

plt.xlabel('Year')
plt.ylabel('Mean final total points')
plt.title('Mean total points by year and language')
plt.legend(title='Language', bbox_to_anchor=(1.05, 1))

plt.show()


new_data = pd.DataFrame({'year': [2023], 'style': ['Rock']})


# Prepare the data
X = songs[['year', 'style']] # Features
y = songs['final_place'] # Target

# Create a Linear Regression model object
model = LinearRegression()

# Encode the categorical variable 
le = LabelEncoder()
X['style'] = le.fit_transform(X['style'])

# Train the model
model.fit(X, y)

# Print the coefficients
print(model.coef_)

# Make predictions on new data
new_data['style'] = le.transform(new_data['style'])
prediction = model.predict(new_data)
print(prediction)

[0.00253852 0.14683223]
[13.67136886]

C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\3447968430.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['style'] = le.fit_transform(X['style'])

Hipotezės¶

Duomenų valymas¶

Laimėtojų sąrašas¶

Populiariausi žanrai¶

Pirmos Hipotezės išvados¶

Žanrų vidutiniai taškai¶

Antros Hipotezės išvados¶

Taškai pagal dainos kalbą¶

Trečios hipotezės išvados¶

Mašininis modelis¶

	year	country	style	language	final_place	final_total_points
0	2022	Albania	Pop	Albanian, English	NaN	NaN
1	2022	Latvia	Pop	English	NaN	NaN
2	2022	Lithuania	Ballad	Lithuanian	14.0	128.0
3	2022	Switzerland	Ballad	English	17.0	78.0
4	2022	Slovenia	Pop	Slovene	NaN	NaN
...	...	...	...	...	...	...
523	2009	France	Ballad	French	8.0	218.0
524	2009	Russia	Ballad	Russian, Ukrainian	11.0	185.0
525	2009	Germany	Pop	English	20.0	91.0
526	2009	United Kingdom	Ballad	English	5.0	328.0
527	2009	Spain	Dance	Spanish, English	24.0	47.0

	year	country	style	language	final_place	final_total_points
2	2022	Ukraine	Traditional	Ukrainian	1.0	631.0
48	2021	Italy	Rock	Italian	1.0	524.0
66	2019	Netherlands	Ballad	English	1.0	498.0
78	2018	Israel	Pop	English	1.0	529.0
105	2017	Portugal	Ballad	Portuguese	1.0	758.0
144	2016	Ukraine	Pop	English, Crimean Tatar	1.0	534.0
169	2015	Sweden	Pop	English	1.0	625.0
192	2014	Austria	Ballad	English	1.0	520.0
206	2013	Denmark	Pop	English	1.0	0.0
245	2012	Sweden	Dance	English	1.0	639.0
265	2011	Azerbaijan	Ballad	English	1.0	405.0
306	2010	Germany	Pop	English	1.0	430.0
318	2009	Norway	Traditional	English	1.0	690.0

	year	style	mean_by_year_style
0	2009	Ballad	213.111111
1	2009	Dance	162.000000
2	2009	Pop	151.600000
3	2009	Rock	160.000000
4	2009	Traditional	245.200000
...	...	...	...
57	2022	Ballad	162.833333
58	2022	Dance	38.000000
59	2022	Pop	229.500000
60	2022	Rock	38.000000
61	2022	Traditional	246.600000