Duomenys paimti iš tarptautinės filmų duomenų bazės, IMDb: https://datasets.imdbws.com/
import sys
!{sys.executable} -m pip install langid
!{sys.executable} -m pip install iso-639
!{sys.executable} -m pip install -U pandasql
import pandas as pd
import langid
from iso639 import languages
from pandasql import sqldf
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
films_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.basics.tsv\\data.tsv', sep='\t', dtype='str', na_values=[r'\N'])
ratings_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.ratings.tsv\\data.tsv', sep='\t', na_values=[r'\N'])
films_original
tconst | titleType | primaryTitle | originalTitle | isAdult | startYear | endYear | runtimeMinutes | genres | |
---|---|---|---|---|---|---|---|---|---|
0 | tt0000001 | short | Carmencita | Carmencita | 0 | 1894 | NaN | 1 | Documentary,Short |
1 | tt0000002 | short | Le clown et ses chiens | Le clown et ses chiens | 0 | 1892 | NaN | 5 | Animation,Short |
2 | tt0000003 | short | Pauvre Pierrot | Pauvre Pierrot | 0 | 1892 | NaN | 4 | Animation,Comedy,Romance |
3 | tt0000004 | short | Un bon bock | Un bon bock | 0 | 1892 | NaN | 12 | Animation,Short |
4 | tt0000005 | short | Blacksmith Scene | Blacksmith Scene | 0 | 1893 | NaN | 1 | Comedy,Short |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8578739 | tt9916848 | tvEpisode | Episode #3.17 | Episode #3.17 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
8578740 | tt9916850 | tvEpisode | Episode #3.19 | Episode #3.19 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
8578741 | tt9916852 | tvEpisode | Episode #3.20 | Episode #3.20 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
8578742 | tt9916856 | short | The Wind | The Wind | 0 | 2015 | NaN | 27 | Short |
8578743 | tt9916880 | tvEpisode | Horrid Henry Knows It All | Horrid Henry Knows It All | 0 | 2014 | NaN | 10 | Adventure,Animation,Comedy |
8578744 rows × 9 columns
ratings_original
tconst | averageRating | numVotes | |
---|---|---|---|
0 | tt0000001 | 5.7 | 1846 |
1 | tt0000002 | 6.0 | 238 |
2 | tt0000003 | 6.5 | 1611 |
3 | tt0000004 | 6.1 | 154 |
4 | tt0000005 | 6.2 | 2436 |
... | ... | ... | ... |
1201926 | tt9916682 | 5.6 | 5 |
1201927 | tt9916690 | 6.6 | 5 |
1201928 | tt9916720 | 5.5 | 164 |
1201929 | tt9916766 | 6.7 | 18 |
1201930 | tt9916778 | 7.3 | 33 |
1201931 rows × 3 columns
films_original['titleType'].unique()
array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie', 'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'], dtype=object)
films = films_original[films_original['titleType'].isin (['movie', 'tvMovie'])]
films = pd.merge(films[['tconst', 'originalTitle', 'startYear', 'runtimeMinutes', 'genres']], ratings_original, on = 'tconst')
films = films[films['numVotes'] >= 100]
films
tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | |
---|---|---|---|---|---|---|---|
1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 |
34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 |
60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 |
68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 |
69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 |
... | ... | ... | ... | ... | ... | ... | ... |
321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 |
321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 |
321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 |
321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 |
321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 |
122480 rows × 7 columns
vertimas = []
l = len(films)
for i in range(l):
vertimas.append(langid.classify(films.iloc[i, 1])[0])
films['language'] = vertimas
films
tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
---|---|---|---|---|---|---|---|---|
1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | en |
34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 | da |
60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 | nb |
68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 | sk |
69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | en |
... | ... | ... | ... | ... | ... | ... | ... | ... |
321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 | tr |
321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 | es |
321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | en |
321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 | it |
321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 | et |
122480 rows × 8 columns
films['language'] = films['language'].apply(lambda x: languages.get(alpha2=x).name)
films
tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
---|---|---|---|---|---|---|---|---|
1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | English |
34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 | Danish |
60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 | Norwegian Bokmål |
68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 | Slovak |
69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | English |
... | ... | ... | ... | ... | ... | ... | ... | ... |
321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 | Turkish |
321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 | Spanish |
321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | English |
321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 | Italian |
321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 | Estonian |
122480 rows × 8 columns
films.to_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\movies_final.csv')
rating0 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films group by language order by average_rating desc")
rating0.index += 1
rating0.head(20)
language | average_rating | movie_count | |
---|---|---|---|
1 | Faroese | 7.066667 | 3 |
2 | Lao | 6.650000 | 2 |
3 | Japanese | 6.642857 | 7 |
4 | Croatian | 6.560582 | 378 |
5 | Walloon | 6.532493 | 357 |
6 | Icelandic | 6.515476 | 84 |
7 | Zulu | 6.498333 | 60 |
8 | Javanese | 6.478107 | 169 |
9 | Irish | 6.424324 | 111 |
10 | Slovak | 6.411358 | 405 |
11 | Occitan (post 1500) | 6.402542 | 118 |
12 | Sinhala | 6.400000 | 1 |
13 | Modern Greek (1453-) | 6.400000 | 4 |
14 | Galician | 6.373418 | 158 |
15 | Aragonese | 6.356250 | 32 |
16 | Vietnamese | 6.347315 | 298 |
17 | Kinyarwanda | 6.340741 | 216 |
18 | Bosnian | 6.320755 | 53 |
19 | Albanian | 6.319767 | 86 |
20 | Catalan | 6.315769 | 260 |
en0 = rating0.copy()
en0 = en0[en0['language'] == 'English']
en0['percentage'] = round(en0['movie_count']/rating0['movie_count'].sum()*100, 1)
en0
language | average_rating | movie_count | percentage | |
---|---|---|---|---|
60 | English | 5.92134 | 72173 | 58.9 |
films[(films['language'] == 'English')]
tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
---|---|---|---|---|---|---|---|---|
index | ||||||||
1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | English |
69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | English |
72 | tt0002199 | From the Manger to the Cross; or, Jesus of Naz... | 1912 | 60 | Biography,Drama | 5.8 | 576 | English |
80 | tt0002423 | Madame DuBarry | 1919 | 85 | Biography,Drama,Romance | 6.7 | 869 | English |
83 | tt0002452 | Independenta Romaniei | 1912 | 120 | History,War | 6.5 | 237 | English |
... | ... | ... | ... | ... | ... | ... | ... | ... |
321364 | tt9902160 | Herself | 2020 | 97 | Drama | 7.0 | 3025 | English |
321375 | tt9904802 | Enemy Lines | 2020 | 92 | War | 4.5 | 1477 | English |
321386 | tt9906262 | The Changin' Times of Ike White | 2019 | 77 | Documentary,Music | 6.7 | 202 | English |
321392 | tt9907782 | Eight for Silver | 2021 | 113 | Horror | 6.0 | 285 | English |
321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | English |
72173 rows × 8 columns
films[(films['language'] == 'Lithuanian')]
tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
---|---|---|---|---|---|---|---|---|
index | ||||||||
82 | tt0002445 | Quo Vadis? | 1913 | 120 | Drama,History | 6.2 | 351 | Lithuanian |
1020 | tt0010208 | Harakiri | 1919 | 80 | Drama | 5.6 | 690 | Lithuanian |
3093 | tt0018199 | Nevada | 1927 | 70 | Western | 6.2 | 121 | Lithuanian |
3865 | tt0020232 | Oblomok imperii | 1929 | 96 | Drama | 7.3 | 397 | Lithuanian |
4798 | tt0022048 | Shukujo to hige | 1931 | 75 | Comedy,Romance | 6.4 | 412 | Lithuanian |
... | ... | ... | ... | ... | ... | ... | ... | ... |
319606 | tt9581076 | Di jiu tianchang | 2019 | 185 | Drama | 7.7 | 3867 | Lithuanian |
319657 | tt9592116 | Aatos ja Amine | 2019 | 73 | Documentary | 7.5 | 208 | Lithuanian |
319757 | tt9614416 | Blackia | 2019 | 141 | Drama | 6.6 | 137 | Lithuanian |
319900 | tt9635540 | Darbar | 2020 | 159 | Action,Crime,Thriller | 6.1 | 5817 | Lithuanian |
320847 | tt9806192 | J'ai perdu mon corps | 2019 | 81 | Animation,Drama,Fantasy | 7.6 | 30605 | Lithuanian |
777 rows × 8 columns
rating1 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000 group by language order by average_rating desc")
rating1.index += 1
rating1.head(20)
language | average_rating | movie_count | |
---|---|---|---|
1 | Faroese | 8.100000 | 1 |
2 | Northern Sami | 7.600000 | 1 |
3 | Malagasy | 7.122222 | 45 |
4 | Walloon | 7.041284 | 109 |
5 | Croatian | 7.023333 | 90 |
6 | Irish | 7.004348 | 23 |
7 | Amharic | 7.000000 | 1 |
8 | Galician | 6.993548 | 31 |
9 | Vietnamese | 6.939130 | 92 |
10 | Kinyarwanda | 6.933929 | 56 |
11 | Javanese | 6.929412 | 51 |
12 | Icelandic | 6.922222 | 27 |
13 | Catalan | 6.911940 | 67 |
14 | Swahili (macrolanguage) | 6.832967 | 182 |
15 | Haitian | 6.807692 | 13 |
16 | Latin | 6.784211 | 19 |
17 | Basque | 6.768919 | 148 |
18 | Xhosa | 6.757447 | 47 |
19 | Breton | 6.751351 | 37 |
20 | Aragonese | 6.736364 | 11 |
en1 = rating1.copy()
en1 = en1[en1['language'] == 'English']
en1['percentage'] = round(en1['movie_count']/rating1['movie_count'].sum()*100, 1)
en1
language | average_rating | movie_count | percentage | |
---|---|---|---|---|
62 | English | 6.231188 | 24910 | 64.6 |
rating2 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=10000 group by language order by average_rating desc")
rating2.index += 1
rating2.head(20)
language | average_rating | movie_count | |
---|---|---|---|
1 | Icelandic | 8.500000 | 2 |
2 | Vietnamese | 7.753333 | 15 |
3 | Azerbaijani | 7.700000 | 1 |
4 | Xhosa | 7.657143 | 7 |
5 | Chinese | 7.633333 | 3 |
6 | Luxembourgish | 7.600000 | 1 |
7 | Latin | 7.600000 | 4 |
8 | Breton | 7.570000 | 10 |
9 | Slovak | 7.533333 | 12 |
10 | Norwegian Nynorsk | 7.460000 | 10 |
11 | Galician | 7.385714 | 7 |
12 | Javanese | 7.383333 | 6 |
13 | Walloon | 7.320000 | 15 |
14 | Lithuanian | 7.305405 | 37 |
15 | Esperanto | 7.300000 | 7 |
16 | Malagasy | 7.250000 | 4 |
17 | Romanian | 7.203846 | 26 |
18 | Volapük | 7.200000 | 1 |
19 | Portuguese | 7.181081 | 37 |
20 | Haitian | 7.175000 | 4 |
en2 = rating2.copy()
en2 = en2[en2['language'] == 'English']
en2['percentage'] = round(en2['movie_count']/rating2['movie_count'].sum()*100, 1)
en2
language | average_rating | movie_count | percentage | |
---|---|---|---|---|
50 | English | 6.568707 | 7334 | 75.3 |
rating3 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=100000 group by language order by average_rating desc")
rating3.index += 1
rating3.head(20)
language | average_rating | movie_count | |
---|---|---|---|
1 | Slovak | 8.400000 | 1 |
2 | Latvian | 8.350000 | 2 |
3 | Portuguese | 8.275000 | 4 |
4 | Walloon | 8.200000 | 1 |
5 | Chinese | 8.200000 | 2 |
6 | Vietnamese | 8.100000 | 2 |
7 | Tagalog | 8.100000 | 1 |
8 | Catalan | 8.100000 | 1 |
9 | Galician | 8.000000 | 1 |
10 | Norwegian | 7.950000 | 2 |
11 | Norwegian Nynorsk | 7.700000 | 2 |
12 | Indonesian | 7.540000 | 10 |
13 | Slovenian | 7.520000 | 5 |
14 | Lithuanian | 7.520000 | 5 |
15 | Welsh | 7.500000 | 1 |
16 | Kinyarwanda | 7.500000 | 1 |
17 | French | 7.480000 | 25 |
18 | Maltese | 7.400000 | 5 |
19 | Esperanto | 7.300000 | 1 |
20 | Italian | 7.292683 | 41 |
en3 = rating3.copy()
en3 = en3[en3['language'] == 'English']
en3['percentage'] = round(en3['movie_count']/rating3['movie_count'].sum()*100, 1)
en3
language | average_rating | movie_count | percentage | |
---|---|---|---|---|
30 | English | 7.021573 | 1678 | 80.5 |
rating4 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000000 group by language order by average_rating desc")
rating4.index += 1
rating4.head(20)
language | average_rating | movie_count | |
---|---|---|---|
1 | Italian | 8.700000 | 1 |
2 | Maltese | 8.600000 | 1 |
3 | English | 8.535897 | 39 |
4 | French | 8.500000 | 1 |
5 | German | 8.450000 | 2 |
6 | Chinese | 8.400000 | 1 |
7 | Spanish | 8.150000 | 2 |
8 | Polish | 8.100000 | 2 |
9 | Norwegian Nynorsk | 8.100000 | 1 |
en4 = rating4.copy()
en4 = en4[en4['language'] == 'English']
en4['percentage'] = round(en4['movie_count']/rating4['movie_count'].sum()*100, 1)
en4
language | average_rating | movie_count | percentage | |
---|---|---|---|---|
3 | English | 8.535897 | 39 | 78.0 |
Kalbų atpažinimo algoritmas nėra tikslus (ypač mažesnėms kalboms, pvz. lietuvių), bet galime daryti išvadas apie bendras tendencijas, susijusias su angliškais filmais.
historical = films.groupby('startYear', as_index=False)['averageRating'].mean()
historical.tail(20)
startYear | averageRating | |
---|---|---|
95 | 2003 | 5.968877 |
96 | 2004 | 5.973783 |
97 | 2005 | 5.913279 |
98 | 2006 | 5.857472 |
99 | 2007 | 5.911971 |
100 | 2008 | 5.854835 |
101 | 2009 | 5.829136 |
102 | 2010 | 5.838862 |
103 | 2011 | 5.842653 |
104 | 2012 | 5.854820 |
105 | 2013 | 5.852284 |
106 | 2014 | 5.875215 |
107 | 2015 | 5.850643 |
108 | 2016 | 5.882517 |
109 | 2017 | 5.879383 |
110 | 2018 | 5.865897 |
111 | 2019 | 5.894913 |
112 | 2020 | 5.753712 |
113 | 2021 | 6.069348 |
114 | 2022 | 6.466667 |
fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 1900 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(1900, 2021)
ax.xaxis.set_major_locator(MultipleLocator(10))
ax.plot(historical['startYear'], historical['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()
historical2 = historical
historical2= historical2[historical2['startYear'] >= 2010]
fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 2010 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(2010, 2021)
ax.plot(historical2['startYear'], historical2['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()