import sys
!{sys.executable} -m pip install langid
!{sys.executable} -m pip install iso-639
!{sys.executable} -m pip install -U pandasql


import pandas as pd
import langid
from iso639 import languages
from pandasql import sqldf
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


films_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.basics.tsv\\data.tsv', sep='\t', dtype='str', na_values=[r'\N'])
ratings_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.ratings.tsv\\data.tsv', sep='\t', na_values=[r'\N'])


films_original


ratings_original


films_original['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)


films = films_original[films_original['titleType'].isin (['movie', 'tvMovie'])]
films = pd.merge(films[['tconst', 'originalTitle', 'startYear', 'runtimeMinutes', 'genres']], ratings_original, on = 'tconst')
films = films[films['numVotes'] >= 100] 
films


vertimas = []
l = len(films)
for i in range(l):
    vertimas.append(langid.classify(films.iloc[i, 1])[0])


films['language'] = vertimas
films


films['language'] = films['language'].apply(lambda x: languages.get(alpha2=x).name)


films


films.to_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\movies_final.csv')


rating0 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films group by language order by average_rating desc")
rating0.index += 1
rating0.head(20)


en0 = rating0.copy()
en0 = en0[en0['language'] == 'English']
en0['percentage'] = round(en0['movie_count']/rating0['movie_count'].sum()*100, 1)
en0


films[(films['language'] == 'English')]


films[(films['language'] == 'Lithuanian')]


rating1 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000 group by language order by average_rating desc")
rating1.index += 1
rating1.head(20)


en1 = rating1.copy()
en1 = en1[en1['language'] == 'English']
en1['percentage'] = round(en1['movie_count']/rating1['movie_count'].sum()*100, 1)
en1


rating2 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=10000 group by language order by average_rating desc")
rating2.index += 1
rating2.head(20)


en2 = rating2.copy()
en2 = en2[en2['language'] == 'English']
en2['percentage'] = round(en2['movie_count']/rating2['movie_count'].sum()*100, 1)
en2


rating3 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=100000 group by language order by average_rating desc")
rating3.index += 1
rating3.head(20)


en3 = rating3.copy()
en3 = en3[en3['language'] == 'English']
en3['percentage'] = round(en3['movie_count']/rating3['movie_count'].sum()*100, 1)
en3


rating4 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000000 group by language order by average_rating desc")
rating4.index += 1
rating4.head(20)


en4 = rating4.copy()
en4 = en4[en4['language'] == 'English']
en4['percentage'] = round(en4['movie_count']/rating4['movie_count'].sum()*100, 1)
en4


historical = films.groupby('startYear', as_index=False)['averageRating'].mean()
historical.tail(20)


fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 1900 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(1900, 2021)
ax.xaxis.set_major_locator(MultipleLocator(10))
ax.plot(historical['startYear'], historical['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()


historical2 = historical
historical2= historical2[historical2['startYear'] >= 2010]
fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 2010 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(2010, 2021)
ax.plot(historical2['startYear'], historical2['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()

	tconst	titleType	primaryTitle	originalTitle	isAdult	startYear	endYear	runtimeMinutes	genres
0	tt0000001	short	Carmencita	Carmencita	0	1894	NaN	1	Documentary,Short
1	tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	0	1892	NaN	5	Animation,Short
2	tt0000003	short	Pauvre Pierrot	Pauvre Pierrot	0	1892	NaN	4	Animation,Comedy,Romance
3	tt0000004	short	Un bon bock	Un bon bock	0	1892	NaN	12	Animation,Short
4	tt0000005	short	Blacksmith Scene	Blacksmith Scene	0	1893	NaN	1	Comedy,Short
...	...	...	...	...	...	...	...	...	...
8578739	tt9916848	tvEpisode	Episode #3.17	Episode #3.17	0	2010	NaN	NaN	Action,Drama,Family
8578740	tt9916850	tvEpisode	Episode #3.19	Episode #3.19	0	2010	NaN	NaN	Action,Drama,Family
8578741	tt9916852	tvEpisode	Episode #3.20	Episode #3.20	0	2010	NaN	NaN	Action,Drama,Family
8578742	tt9916856	short	The Wind	The Wind	0	2015	NaN	27	Short
8578743	tt9916880	tvEpisode	Horrid Henry Knows It All	Horrid Henry Knows It All	0	2014	NaN	10	Adventure,Animation,Comedy

	tconst	averageRating	numVotes
0	tt0000001	5.7	1846
1	tt0000002	6.0	238
2	tt0000003	6.5	1611
3	tt0000004	6.1	154
4	tt0000005	6.2	2436
...	...	...	...
1201926	tt9916682	5.6	5
1201927	tt9916690	6.6	5
1201928	tt9916720	5.5	164
1201929	tt9916766	6.7	18
1201930	tt9916778	7.3	33

	tconst	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes
1	tt0000574	The Story of the Kelly Gang	1906	70	Action,Adventure,Biography	6.1	744
34	tt0001258	Den hvide slavehandel	1910	45	Drama	5.7	139
60	tt0001892	Den sorte drøm	1911	53	Drama	5.9	220
68	tt0002101	Cleopatra	1912	100	Drama,History	5.1	524
69	tt0002130	L'Inferno	1911	71	Adventure,Drama,Fantasy	7.0	2751
...	...	...	...	...	...	...	...
321422	tt9914286	Sokagin Çocuklari	2019	98	Drama,Family	7.6	218
321426	tt9914942	La vida sense la Sara Amat	2019	74	Drama	6.9	136
321436	tt9916190	Safeguard	2020	90	Action,Adventure,Thriller	3.5	227
321438	tt9916270	Il talento del calabrone	2020	84	Thriller	5.8	1260
321439	tt9916362	Akelarre	2020	92	Drama,History	6.4	4226

	tconst	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes	language
1	tt0000574	The Story of the Kelly Gang	1906	70	Action,Adventure,Biography	6.1	744	en
34	tt0001258	Den hvide slavehandel	1910	45	Drama	5.7	139	da
60	tt0001892	Den sorte drøm	1911	53	Drama	5.9	220	nb
68	tt0002101	Cleopatra	1912	100	Drama,History	5.1	524	sk
69	tt0002130	L'Inferno	1911	71	Adventure,Drama,Fantasy	7.0	2751	en
...	...	...	...	...	...	...	...	...
321422	tt9914286	Sokagin Çocuklari	2019	98	Drama,Family	7.6	218	tr
321426	tt9914942	La vida sense la Sara Amat	2019	74	Drama	6.9	136	es
321436	tt9916190	Safeguard	2020	90	Action,Adventure,Thriller	3.5	227	en
321438	tt9916270	Il talento del calabrone	2020	84	Thriller	5.8	1260	it
321439	tt9916362	Akelarre	2020	92	Drama,History	6.4	4226	et

	tconst	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes	language
1	tt0000574	The Story of the Kelly Gang	1906	70	Action,Adventure,Biography	6.1	744	English
34	tt0001258	Den hvide slavehandel	1910	45	Drama	5.7	139	Danish
60	tt0001892	Den sorte drøm	1911	53	Drama	5.9	220	Norwegian Bokmål
68	tt0002101	Cleopatra	1912	100	Drama,History	5.1	524	Slovak
69	tt0002130	L'Inferno	1911	71	Adventure,Drama,Fantasy	7.0	2751	English
...	...	...	...	...	...	...	...	...
321422	tt9914286	Sokagin Çocuklari	2019	98	Drama,Family	7.6	218	Turkish
321426	tt9914942	La vida sense la Sara Amat	2019	74	Drama	6.9	136	Spanish
321436	tt9916190	Safeguard	2020	90	Action,Adventure,Thriller	3.5	227	English
321438	tt9916270	Il talento del calabrone	2020	84	Thriller	5.8	1260	Italian
321439	tt9916362	Akelarre	2020	92	Drama,History	6.4	4226	Estonian

Projekto hipotezės¶

Duomenų apdorojimas¶

Duomenų importavimas, filtravimas¶

Kalbos pridėjimas į lentelę¶

Filmų reitingo ir jų kalbos analizė¶

Kai reitingo balsų kiekis >=100 (visa lentelė)¶

Kai reitingo balsų kiekis >=1 000¶

Kai reitingo balsų kiekis >=10 000¶

Kai reitingo balsų kiekis >=100 000¶

Kai reitingo balsų kiekis >=1 000 000¶

Dalinės išvados¶

Istorinė analizė¶

Grafikai¶

Dalinės išvados¶

Išvados¶

	language	average_rating	movie_count
1	Faroese	7.066667	3
2	Lao	6.650000	2
3	Japanese	6.642857	7
4	Croatian	6.560582	378
5	Walloon	6.532493	357
6	Icelandic	6.515476	84
7	Zulu	6.498333	60
8	Javanese	6.478107	169
9	Irish	6.424324	111
10	Slovak	6.411358	405
11	Occitan (post 1500)	6.402542	118
12	Sinhala	6.400000	1
13	Modern Greek (1453-)	6.400000	4
14	Galician	6.373418	158
15	Aragonese	6.356250	32
16	Vietnamese	6.347315	298
17	Kinyarwanda	6.340741	216
18	Bosnian	6.320755	53
19	Albanian	6.319767	86
20	Catalan	6.315769	260

	tconst	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes	language
index
1	tt0000574	The Story of the Kelly Gang	1906	70	Action,Adventure,Biography	6.1	744	English
69	tt0002130	L'Inferno	1911	71	Adventure,Drama,Fantasy	7.0	2751	English
72	tt0002199	From the Manger to the Cross; or, Jesus of Naz...	1912	60	Biography,Drama	5.8	576	English
80	tt0002423	Madame DuBarry	1919	85	Biography,Drama,Romance	6.7	869	English
83	tt0002452	Independenta Romaniei	1912	120	History,War	6.5	237	English
...	...	...	...	...	...	...	...	...
321364	tt9902160	Herself	2020	97	Drama	7.0	3025	English
321375	tt9904802	Enemy Lines	2020	92	War	4.5	1477	English
321386	tt9906262	The Changin' Times of Ike White	2019	77	Documentary,Music	6.7	202	English
321392	tt9907782	Eight for Silver	2021	113	Horror	6.0	285	English
321436	tt9916190	Safeguard	2020	90	Action,Adventure,Thriller	3.5	227	English

	tconst	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes	language
index
82	tt0002445	Quo Vadis?	1913	120	Drama,History	6.2	351	Lithuanian
1020	tt0010208	Harakiri	1919	80	Drama	5.6	690	Lithuanian
3093	tt0018199	Nevada	1927	70	Western	6.2	121	Lithuanian
3865	tt0020232	Oblomok imperii	1929	96	Drama	7.3	397	Lithuanian
4798	tt0022048	Shukujo to hige	1931	75	Comedy,Romance	6.4	412	Lithuanian
...	...	...	...	...	...	...	...	...
319606	tt9581076	Di jiu tianchang	2019	185	Drama	7.7	3867	Lithuanian
319657	tt9592116	Aatos ja Amine	2019	73	Documentary	7.5	208	Lithuanian
319757	tt9614416	Blackia	2019	141	Drama	6.6	137	Lithuanian
319900	tt9635540	Darbar	2020	159	Action,Crime,Thriller	6.1	5817	Lithuanian
320847	tt9806192	J'ai perdu mon corps	2019	81	Animation,Drama,Fantasy	7.6	30605	Lithuanian

	language	average_rating	movie_count
1	Faroese	8.100000	1
2	Northern Sami	7.600000	1
3	Malagasy	7.122222	45
4	Walloon	7.041284	109
5	Croatian	7.023333	90
6	Irish	7.004348	23
7	Amharic	7.000000	1
8	Galician	6.993548	31
9	Vietnamese	6.939130	92
10	Kinyarwanda	6.933929	56
11	Javanese	6.929412	51
12	Icelandic	6.922222	27
13	Catalan	6.911940	67
14	Swahili (macrolanguage)	6.832967	182
15	Haitian	6.807692	13
16	Latin	6.784211	19
17	Basque	6.768919	148
18	Xhosa	6.757447	47
19	Breton	6.751351	37
20	Aragonese	6.736364	11

	language	average_rating	movie_count
1	Icelandic	8.500000	2
2	Vietnamese	7.753333	15
3	Azerbaijani	7.700000	1
4	Xhosa	7.657143	7
5	Chinese	7.633333	3
6	Luxembourgish	7.600000	1
7	Latin	7.600000	4
8	Breton	7.570000	10
9	Slovak	7.533333	12
10	Norwegian Nynorsk	7.460000	10
11	Galician	7.385714	7
12	Javanese	7.383333	6
13	Walloon	7.320000	15
14	Lithuanian	7.305405	37
15	Esperanto	7.300000	7
16	Malagasy	7.250000	4
17	Romanian	7.203846	26
18	Volapük	7.200000	1
19	Portuguese	7.181081	37
20	Haitian	7.175000	4

	language	average_rating	movie_count
1	Slovak	8.400000	1
2	Latvian	8.350000	2
3	Portuguese	8.275000	4
4	Walloon	8.200000	1
5	Chinese	8.200000	2
6	Vietnamese	8.100000	2
7	Tagalog	8.100000	1
8	Catalan	8.100000	1
9	Galician	8.000000	1
10	Norwegian	7.950000	2
11	Norwegian Nynorsk	7.700000	2
12	Indonesian	7.540000	10
13	Slovenian	7.520000	5
14	Lithuanian	7.520000	5
15	Welsh	7.500000	1
16	Kinyarwanda	7.500000	1
17	French	7.480000	25
18	Maltese	7.400000	5
19	Esperanto	7.300000	1
20	Italian	7.292683	41

	language	average_rating	movie_count
1	Italian	8.700000	1
2	Maltese	8.600000	1
3	English	8.535897	39
4	French	8.500000	1
5	German	8.450000	2
6	Chinese	8.400000	1
7	Spanish	8.150000	2
8	Polish	8.100000	2
9	Norwegian Nynorsk	8.100000	1

	startYear	averageRating
95	2003	5.968877
96	2004	5.973783
97	2005	5.913279
98	2006	5.857472
99	2007	5.911971
100	2008	5.854835
101	2009	5.829136
102	2010	5.838862
103	2011	5.842653
104	2012	5.854820
105	2013	5.852284
106	2014	5.875215
107	2015	5.850643
108	2016	5.882517
109	2017	5.879383
110	2018	5.865897
111	2019	5.894913
112	2020	5.753712
113	2021	6.069348
114	2022	6.466667