# import mysql.connector
# mydb = mysql.connector.connect(
# host="localhost",
# port=3306,
# user="root",
# password="password"
# )
# cursor = mydb.cursor()
# cursor.execute('CREATE DATABASE Baigiamojo')
# cursor.execute("SHOW DATABASES")
# for i in cursor:
# print(i)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Load the 2020 NYT articles dataset and keep only the columns used below.
nt = pd.read_csv('/Users/indre/Downloads/archive/nyt-articles-2020.csv')
# Give the comment-count column a more descriptive name.
nt = nt.rename(columns={'n_comments': 'number_of_comments'})
# Normalise the publication timestamp to a plain calendar date.
nt['pub_date'] = pd.to_datetime(nt['pub_date']).dt.date
columns_of_interest = ['section', 'headline', 'abstract', 'keywords', 'pub_date', 'number_of_comments']
nt_new = nt[columns_of_interest]
nt_new  # notebook-style display of the trimmed frame
section | headline | abstract | keywords | pub_date | number_of_comments | |
---|---|---|---|---|---|---|
0 | Opinion | Protect Veterans From Fraud | Congress could do much more to protect America... | ['Veterans', 'For-Profit Schools', 'Financial ... | 2020-01-01 | 186 |
1 | Crosswords & Games | ‘It’s Green and Slimy’ | Christina Iverson and Jeff Chen ring in the Ne... | ['Crossword Puzzles'] | 2020-01-01 | 257 |
2 | Science | Meteor Showers in 2020 That Will Light Up Nigh... | All year long, Earth passes through streams of... | ['Meteors and Meteorites', 'Space and Astronom... | 2020-01-01 | 6 |
3 | Science | Sync your calendar with the solar system | Never miss an eclipse, a meteor shower, a rock... | ['Space and Astronomy', 'Moon', 'Eclipses', 'S... | 2020-01-01 | 2 |
4 | Science | Rocket Launches, Trips to Mars and More 2020 S... | A year full of highs and lows in space just en... | ['Space and Astronomy', 'Private Spaceflight',... | 2020-01-01 | 25 |
... | ... | ... | ... | ... | ... | ... |
16782 | Opinion | What It Takes to Heal From Covid-19 | Survivors can get better, but they need help. | ['Chronic Condition (Health)', 'Coronavirus (2... | 2020-12-31 | 166 |
16783 | Sports | Padres Jolt M.L.B. With Bold Moves to Set Up W... | While many teams continued to assess the finan... | ['San Diego Padres', 'Major League Baseball', ... | 2020-12-31 | 42 |
16784 | Business Day | Their Finances Ravaged, Customers Fear Banks W... | Banks have the power to decide whether to let ... | ['Banking and Financial Institutions', 'Corona... | 2020-12-31 | 556 |
16785 | Food | Should Wine Be Among Your Health Resolutions? | The new category of ‘clean wines’ is an effort... | ['Wines', 'Grapes', 'Diet and Nutrition', 'Dia... | 2020-12-31 | 277 |
16786 | Technology | Microsoft Says Russian Hackers Viewed Some of ... | The hackers gained more access than the compan... | ['Microsoft Corp', 'US Federal Government Data... | 2020-12-31 | 137 |
16787 rows × 6 columns
# Total comments per section.
nt_section_comments_count = nt_new.groupby(['section'])['number_of_comments'].sum().reset_index()
# Number of articles per section (which sections publish the most).
# rename_axis(...).reset_index(name=...) is version-robust: a plain
# value_counts().reset_index() names its columns differently across
# pandas versions ('index'/<name> before 2.0, <name>/'count' after),
# which made the previous rename map silently produce wrong columns.
nt_section_count = (
    nt_new['section']
    .value_counts()
    .rename_axis('section')
    .reset_index(name='number_of_articles')
)
# One row per section: total comments alongside the article count.
section_articles_comments = pd.merge(nt_section_comments_count, nt_section_count, on=['section'])
section_articles_comments.sort_values("number_of_articles", ascending=False).reset_index()
index | section | number_of_comments | number_of_articles | |
---|---|---|---|---|
0 | 36 | U.S. | 1156457 | 2364 |
1 | 18 | Opinion | 2082114 | 2272 |
2 | 40 | World | 284874 | 1183 |
3 | 1 | Arts | 60180 | 1094 |
4 | 16 | New York | 211581 | 1055 |
5 | 6 | Business Day | 196889 | 932 |
6 | 30 | The Learning Network | 113314 | 708 |
7 | 22 | Real Estate | 46987 | 687 |
8 | 39 | Well | 81669 | 630 |
9 | 11 | Food | 58564 | 573 |
10 | 25 | Sports | 28500 | 569 |
11 | 14 | Magazine | 109963 | 471 |
12 | 8 | Crosswords & Games | 111934 | 468 |
13 | 12 | Health | 78403 | 425 |
14 | 26 | Style | 43404 | 400 |
15 | 4 | Books | 20733 | 371 |
16 | 23 | Science | 23698 | 354 |
17 | 20 | Podcasts | 7309 | 353 |
18 | 31 | The Upshot | 109273 | 338 |
19 | 29 | Technology | 49165 | 254 |
20 | 33 | Theater | 5590 | 245 |
21 | 35 | Travel | 25199 | 213 |
22 | 15 | Movies | 20243 | 173 |
23 | 7 | Climate | 26613 | 148 |
24 | 34 | Times Insider | 8336 | 115 |
25 | 19 | Parenting | 3533 | 86 |
26 | 28 | T Magazine | 2303 | 83 |
27 | 21 | Reader Center | 4685 | 78 |
28 | 10 | Fashion & Style | 2477 | 51 |
29 | 17 | Obituaries | 2763 | 44 |
30 | 24 | Smarter Living | 682 | 7 |
31 | 27 | Sunday Review | 5469 | 7 |
32 | 9 | Education | 182 | 7 |
33 | 32 | The Weekly | 205 | 6 |
34 | 37 | Universal | 109 | 5 |
35 | 2 | At Home | 210 | 5 |
36 | 41 | Your Money | 322 | 4 |
37 | 38 | Video | 1812 | 3 |
38 | 5 | Briefing | 226 | 3 |
39 | 13 | International Home | 263 | 1 |
40 | 3 | Automobiles | 29 | 1 |
41 | 0 | Admin | 199 | 1 |
# Horizontal layered bar chart per section: total comments (pastel)
# with the article count (muted) overlaid, sorted by article count.
sns.set_theme(style="whitegrid")
section_articles_comments = section_articles_comments.sort_values("number_of_articles", ascending=False)
f, ax = plt.subplots(figsize=(20, 15))
# Wider (pastel) bars: total number of comments per section.
sns.set_color_codes("pastel")
sns.barplot(x="number_of_comments", y="section", data=section_articles_comments,
label="number_of_comments", color="b",)
# Overlaid (muted) bars: number of articles per section.
sns.set_color_codes("muted")
sns.barplot(x="number_of_articles", y="section", data=section_articles_comments,
label="number_of_articles", color="b" )
#sns.barplot.sort_by('number_of_articles', ascending=False)
ax.legend(ncol=2, loc="lower right", frameon=True)
# NOTE(review): xlim of 300000 clips the largest comment totals
# (e.g. Opinion at ~2M) — confirm the truncation is intentional.
ax.set(xlim=(0, 300000), ylabel="" ),
# xlabel="Comments and ")
sns.despine(left=True, bottom=True)
plt.tight_layout()
def _top_keywords(frame, label, column_name, top_n=25):
    """Return the top_n most frequent whitespace-split keyword tokens.

    frame       -- DataFrame indexed by section, with a 'keywords' column
    label       -- list of index labels to select, or None for all rows
    column_name -- name for the word column in the result
    Returns a DataFrame with columns [column_name, 'word_count'].
    """
    if label is None:
        keywords = frame['keywords']
    else:
        keywords = frame.loc[label, 'keywords']
    tokens = ' '.join(keywords).lower().split()
    counts = pd.Series(tokens).value_counts()[:100]
    # reset_index(name=...) labels the count column explicitly. The
    # previous rename(columns={'0': 'word_count'}) used the *string*
    # '0', which never matched the integer 0 column name, leaving the
    # count columns unlabeled (visible as "0" headers in the output).
    return counts.rename_axis(column_name).reset_index(name='word_count').head(top_n)

# Re-read the CSV indexed by section so rows can be selected per section.
nt_new_loc = pd.read_csv('/Users/indre/Downloads/archive/nyt-articles-2020.csv', index_col=1)
nt_new_loc['keywords'] = nt_new_loc['keywords'].astype(str)
keywords_list_us = _top_keywords(nt_new_loc, ['U.S.'], 'word_us')
keywords_list_opinion = _top_keywords(nt_new_loc, ['Opinion'], 'word_opinion')
keywords_list_world = _top_keywords(nt_new_loc, ['World'], 'word_world')
keywords_list_upshot = _top_keywords(nt_new_loc, ['The Upshot'], 'word_upshots')
keywords_list_magazine = _top_keywords(nt_new_loc, ['Magazine'], 'word_magazine')
keywords_list_business = _top_keywords(nt_new_loc, ['Business Day'], 'word_business')
keywords_list_all = _top_keywords(nt_new_loc, None, 'word_all')
# Side-by-side table: 25 most frequent keyword tokens per section.
frequent_words_bysections = pd.concat([keywords_list_us, keywords_list_opinion, keywords_list_world, keywords_list_upshot, keywords_list_magazine, keywords_list_business, keywords_list_all], axis=1)
frequent_words_bysections
word_us | 0 | word_opinion | 0 | word_world | 0 | word_upshots | 0 | word_magazine | 0 | word_business | 0 | word_all | 0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | and | 4562 | and | 4366 | and | 1846 | and | 695 | and | 694 | and | 2279 | and | 26472 |
1 | 'united | 1522 | 'united | 1583 | (2019-ncov)', | 439 | 'coronavirus | 148 | states | 131 | 'coronavirus | 520 | 'united | 4755 |
2 | states | 1495 | states | 1407 | government', | 362 | (2019-ncov)', | 145 | 'united | 126 | (2019-ncov)', | 500 | (2019-ncov)', | 4733 |
3 | of | 1432 | of | 1125 | ['coronavirus | 361 | of | 137 | (2019-ncov)', | 104 | 'united | 417 | of | 4642 |
4 | politics | 1116 | politics | 1060 | 'politics | 326 | 'united | 126 | 'coronavirus | 81 | states | 387 | 'coronavirus | 4512 |
5 | donald | 1059 | donald | 1045 | 'united | 305 | states | 121 | of | 72 | inc', | 235 | states | 4282 |
6 | election | 986 | 'trump, | 949 | 'coronavirus | 276 | election | 113 | forces', | 59 | economy', | 209 | donald | 2866 |
7 | j', | 940 | government', | 939 | states | 238 | 2020', | 95 | 'black | 54 | of | 200 | government', | 2833 |
8 | government', | 909 | election | 830 | of | 205 | public | 71 | ['coronavirus | 54 | ['coronavirus | 186 | politics | 2762 |
9 | 2020', | 883 | j', | 821 | (2019-ncov)'] | 153 | opinion', | 64 | people', | 52 | government', | 158 | 'trump, | 2539 |
10 | 'trump, | 878 | (2019-ncov)', | 792 | military | 151 | donald | 64 | military | 51 | 'labor | 130 | j', | 2457 |
11 | party', | 713 | 2020', | 725 | international | 147 | r | 63 | war | 50 | jobs', | 123 | election | 2251 |
12 | (2019-ncov)', | 710 | 'coronavirus | 689 | forces', | 146 | 'presidential | 62 | 'blacks', | 49 | politics | 114 | ['coronavirus | 2012 |
13 | 'presidential | 674 | party', | 660 | relations', | 146 | joseph | 62 | politics | 46 | conditions | 111 | 2020', | 1980 |
14 | 'coronavirus | 622 | 'presidential | 529 | )', | 141 | 'polls | 61 | defense | 44 | economic | 105 | )', | 1846 |
15 | (us)', | 572 | (us)', | 481 | donald | 130 | 'trump, | 61 | government', | 41 | (2020)', | 104 | york | 1683 |
16 | jr', | 557 | jr', | 409 | 'china', | 119 | economy', | 61 | )', | 39 | 'economic | 104 | 'new | 1631 |
17 | r | 539 | r | 391 | 'trump, | 117 | ['coronavirus | 59 | election | 36 | act | 104 | party', | 1530 |
18 | joseph | 536 | 'democratic | 389 | 'quarantines', | 116 | 'biden, | 58 | ethnicity', | 34 | (us)', | 102 | 'presidential | 1512 |
19 | 'democratic | 484 | joseph | 389 | j', | 112 | politics | 55 | donald | 33 | & | 102 | (us)', | 1382 |
20 | 'biden, | 453 | 'republican | 371 | protests | 102 | ['presidential | 52 | 2020', | 30 | security | 102 | the | 1303 |
21 | 'republican | 368 | 'biden, | 352 | rights | 100 | jr', | 51 | ii | 30 | bonds', | 98 | jr', | 1251 |
22 | ['presidential | 362 | ['coronavirus | 346 | 'deaths | 94 | care', | 49 | (1939-45)', | 29 | media', | 98 | r | 1196 |
23 | ['coronavirus | 351 | ['presidential | 320 | 'defense | 94 | insurance | 46 | protests | 28 | donald | 97 | joseph | 1187 |
24 | )', | 338 | j'] | 253 | 'epidemics', | 91 | act | 45 | 'trump, | 27 | 'shutdowns | 97 | inc', | 1184 |
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure(figsize=(13,8))
# Decorative plot ("just a pretty table" per the string): one '*'
# marker per occurrence of each character, with the text itself
# printed along y=55.
searchstr = 'TIESIOG GRAŽI LENTELĖ'
chars = list(set(searchstr))
chars.sort()
for char in chars:
    # 1-based x positions of every occurrence of `char`.
    indices = [i + 1 for i in range(len(searchstr)) if searchstr[i] == char]
    plt.plot(indices, [ord(char)] * len(indices), '*')
for index, ch in enumerate(searchstr):
    # NOTE(review): text uses 0-based x while the markers above use
    # 1-based — confirm the one-column offset is intentional.
    plt.text(index, 55, ch, family='monospace')
plt.xlim(0, len(searchstr))
plt.ylim(ord(' ') - 1, ord('Z') + 1)
# Minor vertical gridlines, one per character column.
# (The duplicated grid() call was removed; one call suffices.)
plt.gca().xaxis.grid(True, 'minor')
plt.gca().xaxis.set_minor_locator(ticker.MultipleLocator(1))
plt.show()
from matplotlib_venn import venn2
# Venn diagram comparing the top keywords of the U.S. and Business Day
# sections, with the actual words drawn inside each region instead of
# the default counts (technique from
# https://stackoverflow.com/questions/55717203/plot-actual-set-items-in-python-not-the-number-of-items).
plt.figure(figsize=(12,9))
set1 = set(frequent_words_bysections['word_us'])
set2 = set(frequent_words_bysections['word_business'])
plt.title('U.S. ir The Business sekcijų keywords palyginimas')
venn = venn2([set1, set2])
region_words = {'100': set1 - set2, '110': set1 & set2, '010': set2 - set1}
for region_id, words in region_words.items():
    venn.get_label_by_id(region_id).set_text('\n'.join(map(str, words)))
# Word cloud built from the 'word_all' keyword column.
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
df = frequent_words_bysections['word_all']
# NOTE(review): value_counts() counts occurrences of each word *within
# this 25-row column*; since the words are unique, every frequency is 1
# and the cloud sizes come out uniform. The original token counts were
# probably intended as the weights — confirm.
counts = df.value_counts()
counts.index = counts.index.map(str)
wordcloud = WordCloud().generate_from_frequencies(counts)
plt.figure(figsize=(12,9))
# The WordCloud built above is discarded; this styled one is displayed.
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate_from_frequencies(counts)
# stopwords = set(STOPWORDS)
# stopwords.update(["corona", "trump", "2020",])
# wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate_from_frequencies(counts)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
from sklearn.linear_model import LinearRegression
section_articles_comments  # notebook-style display of the per-section totals
section | number_of_comments | number_of_articles | |
---|---|---|---|
36 | U.S. | 1156457 | 2364 |
18 | Opinion | 2082114 | 2272 |
40 | World | 284874 | 1183 |
1 | Arts | 60180 | 1094 |
16 | New York | 211581 | 1055 |
6 | Business Day | 196889 | 932 |
30 | The Learning Network | 113314 | 708 |
22 | Real Estate | 46987 | 687 |
39 | Well | 81669 | 630 |
11 | Food | 58564 | 573 |
25 | Sports | 28500 | 569 |
14 | Magazine | 109963 | 471 |
8 | Crosswords & Games | 111934 | 468 |
12 | Health | 78403 | 425 |
26 | Style | 43404 | 400 |
4 | Books | 20733 | 371 |
23 | Science | 23698 | 354 |
20 | Podcasts | 7309 | 353 |
31 | The Upshot | 109273 | 338 |
29 | Technology | 49165 | 254 |
33 | Theater | 5590 | 245 |
35 | Travel | 25199 | 213 |
15 | Movies | 20243 | 173 |
7 | Climate | 26613 | 148 |
34 | Times Insider | 8336 | 115 |
19 | Parenting | 3533 | 86 |
28 | T Magazine | 2303 | 83 |
21 | Reader Center | 4685 | 78 |
10 | Fashion & Style | 2477 | 51 |
17 | Obituaries | 2763 | 44 |
24 | Smarter Living | 682 | 7 |
27 | Sunday Review | 5469 | 7 |
9 | Education | 182 | 7 |
32 | The Weekly | 205 | 6 |
37 | Universal | 109 | 5 |
2 | At Home | 210 | 5 |
41 | Your Money | 322 | 4 |
38 | Video | 1812 | 3 |
5 | Briefing | 226 | 3 |
13 | International Home | 263 | 1 |
3 | Automobiles | 29 | 1 |
0 | Admin | 199 | 1 |
# Simple linear regression: predict total comments from article count.
X= section_articles_comments[['number_of_articles']]
y= section_articles_comments['number_of_comments']
model = LinearRegression()
model.fit(X, y)
# The next line is pasted notebook output (the repr printed by
# model.fit), not intentional code; it constructs and immediately
# discards an unused estimator.
LinearRegression()
# Rounded fitted values, attached for side-by-side comparison.
prediction = model.predict(X).round()
section_articles_comments['Prediction'] = prediction
section_articles_comments
section | number_of_comments | number_of_articles | Prediction | |
---|---|---|---|---|
36 | U.S. | 1156457 | 2364 | 1204325.0 |
18 | Opinion | 2082114 | 2272 | 1153480.0 |
40 | World | 284874 | 1183 | 551631.0 |
1 | Arts | 60180 | 1094 | 502444.0 |
16 | New York | 211581 | 1055 | 480890.0 |
6 | Business Day | 196889 | 932 | 412913.0 |
30 | The Learning Network | 113314 | 708 | 289116.0 |
22 | Real Estate | 46987 | 687 | 277510.0 |
39 | Well | 81669 | 630 | 246009.0 |
11 | Food | 58564 | 573 | 214507.0 |
25 | Sports | 28500 | 569 | 212296.0 |
14 | Magazine | 109963 | 471 | 158135.0 |
8 | Crosswords & Games | 111934 | 468 | 156477.0 |
12 | Health | 78403 | 425 | 132713.0 |
26 | Style | 43404 | 400 | 118896.0 |
4 | Books | 20733 | 371 | 102869.0 |
23 | Science | 23698 | 354 | 93474.0 |
20 | Podcasts | 7309 | 353 | 92921.0 |
31 | The Upshot | 109273 | 338 | 84631.0 |
29 | Technology | 49165 | 254 | 38208.0 |
33 | Theater | 5590 | 245 | 33234.0 |
35 | Travel | 25199 | 213 | 15548.0 |
15 | Movies | 20243 | 173 | -6558.0 |
7 | Climate | 26613 | 148 | -20375.0 |
34 | Times Insider | 8336 | 115 | -38612.0 |
19 | Parenting | 3533 | 86 | -54640.0 |
28 | T Magazine | 2303 | 83 | -56298.0 |
21 | Reader Center | 4685 | 78 | -59061.0 |
10 | Fashion & Style | 2477 | 51 | -73983.0 |
17 | Obituaries | 2763 | 44 | -77851.0 |
24 | Smarter Living | 682 | 7 | -98300.0 |
27 | Sunday Review | 5469 | 7 | -98300.0 |
9 | Education | 182 | 7 | -98300.0 |
32 | The Weekly | 205 | 6 | -98853.0 |
37 | Universal | 109 | 5 | -99405.0 |
2 | At Home | 210 | 5 | -99405.0 |
41 | Your Money | 322 | 4 | -99958.0 |
38 | Video | 1812 | 3 | -100511.0 |
5 | Briefing | 226 | 3 | -100511.0 |
13 | International Home | 263 | 1 | -101616.0 |
3 | Automobiles | 29 | 1 | -101616.0 |
0 | Admin | 199 | 1 | -101616.0 |
# Predicted comment total for a hypothetical section with 200 articles.
model.predict([[200]])
# NOTE(review): the next line is pasted notebook output, not code;
# `array` is undefined here, so it would raise NameError if executed.
array([8363.86966737])
# Coefficient of determination of the fit on the training data.
r_squared = model.score(X, y)
r_squared
0.6989108894456372
# Scatter of articles vs. comments with the fitted line overlaid.
plt.scatter(X,y)
plt.plot(X, prediction, 'r', linestyle='-.')
plt.tight_layout()
plt.title('Koreliacija tarp straipsnių skaičiaus ir komentarų skaičiaus')
# The Text(...) line below is pasted notebook output (plt.title's
# return value), not intentional code; it would raise NameError if run.
Text(0.5, 1.0, 'Koreliacija tarp straipsnių skaičiaus ir komentarų skaičiaus')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure(figsize=(13,8))
# Decorative character plot, same technique as the one above: one '*'
# marker per occurrence of each character, text printed along y=55.
searchstr = 'BET LENTELĖ GRAŽI, NESAKYKIT'
chars = list(set(searchstr))
chars.sort()
for char in chars:
    # 1-based x positions of every occurrence of `char`.
    indices = [i + 1 for i in range(len(searchstr)) if searchstr[i] == char]
    plt.plot(indices, [ord(char)] * len(indices), '*')
for index, ch in enumerate(searchstr):
    plt.text(index, 55, ch, family='monospace')
plt.xlim(0, len(searchstr))
plt.ylim(ord(' ') - 1, ord('Z') + 1)
# Minor vertical gridlines, one per character column.
# (The duplicated grid() call was removed; one call suffices.)
plt.gca().xaxis.grid(True, 'minor')
plt.gca().xaxis.set_minor_locator(ticker.MultipleLocator(1))
plt.show()