# import mysql.connector
# mydb = mysql.connector.connect(
# host="localhost",
# port=3306,
# user="root",
# password="password"
# )
# cursor = mydb.cursor()
# cursor.execute('CREATE DATABASE Baigiamojo')
# cursor.execute("SHOW DATABASES")
# for i in cursor:
# print(i)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Load the 2020 NYT articles dataset and keep only the columns used below.
nt = pd.read_csv('/Users/indre/Downloads/archive/nyt-articles-2020.csv')
# Give the comment-count column a more descriptive name.
nt = nt.rename(columns={'n_comments': 'number_of_comments'})
# Normalise the publication timestamp to a plain calendar date.
nt['pub_date'] = pd.to_datetime(nt['pub_date']).dt.date
columns_of_interest = ['section', 'headline', 'abstract', 'keywords', 'pub_date', 'number_of_comments']
nt_new = nt[columns_of_interest]
nt_new  # notebook-style display of the trimmed frame
section | headline | abstract | keywords | pub_date | number_of_comments | |
---|---|---|---|---|---|---|
0 | Opinion | Protect Veterans From Fraud | Congress could do much more to protect America... | ['Veterans', 'For-Profit Schools', 'Financial ... | 2020-01-01 | 186 |
1 | Crosswords & Games | ‘It’s Green and Slimy’ | Christina Iverson and Jeff Chen ring in the Ne... | ['Crossword Puzzles'] | 2020-01-01 | 257 |
2 | Science | Meteor Showers in 2020 That Will Light Up Nigh... | All year long, Earth passes through streams of... | ['Meteors and Meteorites', 'Space and Astronom... | 2020-01-01 | 6 |
3 | Science | Sync your calendar with the solar system | Never miss an eclipse, a meteor shower, a rock... | ['Space and Astronomy', 'Moon', 'Eclipses', 'S... | 2020-01-01 | 2 |
4 | Science | Rocket Launches, Trips to Mars and More 2020 S... | A year full of highs and lows in space just en... | ['Space and Astronomy', 'Private Spaceflight',... | 2020-01-01 | 25 |
... | ... | ... | ... | ... | ... | ... |
16782 | Opinion | What It Takes to Heal From Covid-19 | Survivors can get better, but they need help. | ['Chronic Condition (Health)', 'Coronavirus (2... | 2020-12-31 | 166 |
16783 | Sports | Padres Jolt M.L.B. With Bold Moves to Set Up W... | While many teams continued to assess the finan... | ['San Diego Padres', 'Major League Baseball', ... | 2020-12-31 | 42 |
16784 | Business Day | Their Finances Ravaged, Customers Fear Banks W... | Banks have the power to decide whether to let ... | ['Banking and Financial Institutions', 'Corona... | 2020-12-31 | 556 |
16785 | Food | Should Wine Be Among Your Health Resolutions? | The new category of ‘clean wines’ is an effort... | ['Wines', 'Grapes', 'Diet and Nutrition', 'Dia... | 2020-12-31 | 277 |
16786 | Technology | Microsoft Says Russian Hackers Viewed Some of ... | The hackers gained more access than the compan... | ['Microsoft Corp', 'US Federal Government Data... | 2020-12-31 | 137 |
16787 rows × 6 columns
# Total comments per section.
nt_section_comments_count = nt_new.groupby(['section'])['number_of_comments'].sum().reset_index()
# Number of articles per section (which sections publish the most).
# rename_axis(...).reset_index(name=...) is version-robust: a plain
# value_counts().reset_index() names its columns differently across
# pandas versions ('index'/<name> before 2.0, <name>/'count' after),
# which made the previous rename map silently produce wrong columns.
nt_section_count = (
    nt_new['section']
    .value_counts()
    .rename_axis('section')
    .reset_index(name='number_of_articles')
)
# One row per section: total comments alongside the article count.
section_articles_comments = pd.merge(nt_section_comments_count, nt_section_count, on=['section'])
section_articles_comments.sort_values("number_of_articles", ascending=False).reset_index()
index | section | number_of_comments | number_of_articles | |
---|---|---|---|---|
0 | 36 | U.S. | 1156457 | 2364 |
1 | 18 | Opinion | 2082114 | 2272 |
2 | 40 | World | 284874 | 1183 |
3 | 1 | Arts | 60180 | 1094 |
4 | 16 | New York | 211581 | 1055 |
5 | 6 | Business Day | 196889 | 932 |
6 | 30 | The Learning Network | 113314 | 708 |
7 | 22 | Real Estate | 46987 | 687 |
8 | 39 | Well | 81669 | 630 |
9 | 11 | Food | 58564 | 573 |
10 | 25 | Sports | 28500 | 569 |
11 | 14 | Magazine | 109963 | 471 |
12 | 8 | Crosswords & Games | 111934 | 468 |
13 | 12 | Health | 78403 | 425 |
14 | 26 | Style | 43404 | 400 |
15 | 4 | Books | 20733 | 371 |
16 | 23 | Science | 23698 | 354 |
17 | 20 | Podcasts | 7309 | 353 |
18 | 31 | The Upshot | 109273 | 338 |
19 | 29 | Technology | 49165 | 254 |
20 | 33 | Theater | 5590 | 245 |
21 | 35 | Travel | 25199 | 213 |
22 | 15 | Movies | 20243 | 173 |
23 | 7 | Climate | 26613 | 148 |
24 | 34 | Times Insider | 8336 | 115 |
25 | 19 | Parenting | 3533 | 86 |
26 | 28 | T Magazine | 2303 | 83 |
27 | 21 | Reader Center | 4685 | 78 |
28 | 10 | Fashion & Style | 2477 | 51 |
29 | 17 | Obituaries | 2763 | 44 |
30 | 24 | Smarter Living | 682 | 7 |
31 | 27 | Sunday Review | 5469 | 7 |
32 | 9 | Education | 182 | 7 |
33 | 32 | The Weekly | 205 | 6 |
34 | 37 | Universal | 109 | 5 |
35 | 2 | At Home | 210 | 5 |
36 | 41 | Your Money | 322 | 4 |
37 | 38 | Video | 1812 | 3 |
38 | 5 | Briefing | 226 | 3 |
39 | 13 | International Home | 263 | 1 |
40 | 3 | Automobiles | 29 | 1 |
41 | 0 | Admin | 199 | 1 |
# Horizontal layered bar chart per section: total comments (pastel)
# with the article count (muted) overlaid, sorted by article count.
sns.set_theme(style="whitegrid")
section_articles_comments = section_articles_comments.sort_values("number_of_articles", ascending=False)
f, ax = plt.subplots(figsize=(20, 15))
# Wider (pastel) bars: total number of comments per section.
sns.set_color_codes("pastel")
sns.barplot(x="number_of_comments", y="section", data=section_articles_comments,
label="number_of_comments", color="b",)
# Overlaid (muted) bars: number of articles per section.
sns.set_color_codes("muted")
sns.barplot(x="number_of_articles", y="section", data=section_articles_comments,
label="number_of_articles", color="b" )
#sns.barplot.sort_by('number_of_articles', ascending=False)
ax.legend(ncol=2, loc="lower right", frameon=True)
# NOTE(review): xlim of 300000 clips the largest comment totals
# (e.g. Opinion at ~2M) — confirm the truncation is intentional.
ax.set(xlim=(0, 300000), ylabel="" ),
# xlabel="Comments and ")
sns.despine(left=True, bottom=True)
plt.tight_layout()
def _top_keywords(frame, label, column_name, top_n=25):
    """Return the top_n most frequent whitespace-split keyword tokens.

    frame       -- DataFrame indexed by section, with a 'keywords' column
    label       -- list of index labels to select, or None for all rows
    column_name -- name for the word column in the result
    Returns a DataFrame with columns [column_name, 'word_count'].
    """
    if label is None:
        keywords = frame['keywords']
    else:
        keywords = frame.loc[label, 'keywords']
    tokens = ' '.join(keywords).lower().split()
    counts = pd.Series(tokens).value_counts()[:100]
    # reset_index(name=...) labels the count column explicitly. The
    # previous rename(columns={'0': 'word_count'}) used the *string*
    # '0', which never matched the integer 0 column name, leaving the
    # count columns unlabeled (visible as "0" headers in the output).
    return counts.rename_axis(column_name).reset_index(name='word_count').head(top_n)

# Re-read the CSV indexed by section so rows can be selected per section.
nt_new_loc = pd.read_csv('/Users/indre/Downloads/archive/nyt-articles-2020.csv', index_col=1)
nt_new_loc['keywords'] = nt_new_loc['keywords'].astype(str)
keywords_list_us = _top_keywords(nt_new_loc, ['U.S.'], 'word_us')
keywords_list_opinion = _top_keywords(nt_new_loc, ['Opinion'], 'word_opinion')
keywords_list_world = _top_keywords(nt_new_loc, ['World'], 'word_world')
keywords_list_upshot = _top_keywords(nt_new_loc, ['The Upshot'], 'word_upshots')
keywords_list_magazine = _top_keywords(nt_new_loc, ['Magazine'], 'word_magazine')
keywords_list_business = _top_keywords(nt_new_loc, ['Business Day'], 'word_business')
keywords_list_all = _top_keywords(nt_new_loc, None, 'word_all')
# Side-by-side table: 25 most frequent keyword tokens per section.
frequent_words_bysections = pd.concat([keywords_list_us, keywords_list_opinion, keywords_list_world, keywords_list_upshot, keywords_list_magazine, keywords_list_business, keywords_list_all], axis=1)
frequent_words_bysections
word_us | 0 | word_opinion | 0 | word_world | 0 | word_upshots | 0 | word_magazine | 0 | word_business | 0 | word_all | 0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | and | 4562 | and | 4366 | and | 1846 | and | 695 | and | 694 | and | 2279 | and | 26472 |
1 | 'united | 1522 | 'united | 1583 | (2019-ncov)', | 439 | 'coronavirus | 148 | states | 131 | 'coronavirus | 520 | 'united | 4755 |
2 | states | 1495 | states | 1407 | government', | 362 | (2019-ncov)', | 145 | 'united | 126 | (2019-ncov)', | 500 | (2019-ncov)', | 4733 |
3 | of | 1432 | of | 1125 | ['coronavirus | 361 | of | 137 | (2019-ncov)', | 104 | 'united | 417 | of | 4642 |
4 | politics | 1116 | politics | 1060 | 'politics | 326 | 'united | 126 | 'coronavirus | 81 | states | 387 | 'coronavirus | 4512 |
5 | donald | 1059 | donald | 1045 | 'united | 305 | states | 121 | of | 72 | inc', | 235 | states | 4282 |
6 | election | 986 | 'trump, | 949 | 'coronavirus | 276 | election | 113 | forces', | 59 | economy', | 209 | donald | 2866 |
7 | j', | 940 | government', | 939 | states | 238 | 2020', | 95 | 'black | 54 | of | 200 | government', | 2833 |
8 | government', | 909 | election | 830 | of | 205 | public | 71 | ['coronavirus | 54 | ['coronavirus | 186 | politics | 2762 |
9 | 2020', | 883 | j', | 821 | (2019-ncov)'] | 153 | opinion', | 64 | people', | 52 | government', | 158 | 'trump, | 2539 |
10 | 'trump, | 878 | (2019-ncov)', | 792 | military | 151 | donald | 64 | military | 51 | 'labor | 130 | j', | 2457 |
11 | party', | 713 | 2020', | 725 | international | 147 | r | 63 | war | 50 | jobs', | 123 | election | 2251 |
12 | (2019-ncov)', | 710 | 'coronavirus | 689 | forces', | 146 | 'presidential | 62 | 'blacks', | 49 | politics | 114 | ['coronavirus | 2012 |
13 | 'presidential | 674 | party', | 660 | relations', | 146 | joseph | 62 | politics | 46 | conditions | 111 | 2020', | 1980 |
14 | 'coronavirus | 622 | 'presidential | 529 | )', | 141 | 'polls | 61 | defense | 44 | economic | 105 | )', | 1846 |
15 | (us)', | 572 | (us)', | 481 | donald | 130 | 'trump, | 61 | government', | 41 | (2020)', | 104 | york | 1683 |
16 | jr', | 557 | jr', | 409 | 'china', | 119 | economy', | 61 | )', | 39 | 'economic | 104 | 'new | 1631 |
17 | r | 539 | r | 391 | 'trump, | 117 | ['coronavirus | 59 | election | 36 | act | 104 | party', | 1530 |
18 | joseph | 536 | 'democratic | 389 | 'quarantines', | 116 | 'biden, | 58 | ethnicity', | 34 | (us)', | 102 | 'presidential | 1512 |
19 | 'democratic | 484 | joseph | 389 | j', | 112 | politics | 55 | donald | 33 | & | 102 | (us)', | 1382 |
20 | 'biden, | 453 | 'republican | 371 | protests | 102 | ['presidential | 52 | 2020', | 30 | security | 102 | the | 1303 |
21 | 'republican | 368 | 'biden, | 352 | rights | 100 | jr', | 51 | ii | 30 | bonds', | 98 | jr', | 1251 |
22 | ['presidential | 362 | ['coronavirus | 346 | 'deaths | 94 | care', | 49 | (1939-45)', | 29 | media', | 98 | r | 1196 |
23 | ['coronavirus | 351 | ['presidential | 320 | 'defense | 94 | insurance | 46 | protests | 28 | donald | 97 | joseph | 1187 |
24 | )', | 338 | j'] | 253 | 'epidemics', | 91 | act | 45 | 'trump, | 27 | 'shutdowns | 97 | inc', | 1184 |
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure(figsize=(13,8))
# Decorative plot ("just a pretty table" per the string): one '*'
# marker per occurrence of each character, with the text itself
# printed along y=55.
searchstr = 'TIESIOG GRAŽI LENTELĖ'
chars = list(set(searchstr))
chars.sort()
for char in chars:
    # 1-based x positions of every occurrence of `char`.
    indices = [i + 1 for i in range(len(searchstr)) if searchstr[i] == char]
    plt.plot(indices, [ord(char)] * len(indices), '*')
for index, ch in enumerate(searchstr):
    # NOTE(review): text uses 0-based x while the markers above use
    # 1-based — confirm the one-column offset is intentional.
    plt.text(index, 55, ch, family='monospace')
plt.xlim(0, len(searchstr))
plt.ylim(ord(' ') - 1, ord('Z') + 1)
# Minor vertical gridlines, one per character column.
# (The duplicated grid() call was removed; one call suffices.)
plt.gca().xaxis.grid(True, 'minor')
plt.gca().xaxis.set_minor_locator(ticker.MultipleLocator(1))
plt.show()
from matplotlib_venn import venn2
# Venn diagram comparing the top keywords of the U.S. and Business Day
# sections, with the actual words drawn inside each region instead of
# the default counts (technique from
# https://stackoverflow.com/questions/55717203/plot-actual-set-items-in-python-not-the-number-of-items).
plt.figure(figsize=(12,9))
set1 = set(frequent_words_bysections['word_us'])
set2 = set(frequent_words_bysections['word_business'])
plt.title('U.S. ir The Business sekcijų keywords palyginimas')
venn = venn2([set1, set2])
region_words = {'100': set1 - set2, '110': set1 & set2, '010': set2 - set1}
for region_id, words in region_words.items():
    venn.get_label_by_id(region_id).set_text('\n'.join(map(str, words)))
# Word cloud built from the 'word_all' keyword column.
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
df = frequent_words_bysections['word_all']
# NOTE(review): value_counts() counts occurrences of each word *within
# this 25-row column*; since the words are unique, every frequency is 1
# and the cloud sizes come out uniform. The original token counts were
# probably intended as the weights — confirm.
counts = df.value_counts()
counts.index = counts.index.map(str)
wordcloud = WordCloud().generate_from_frequencies(counts)
plt.figure(figsize=(12,9))
# The WordCloud built above is discarded; this styled one is displayed.
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate_from_frequencies(counts)
# stopwords = set(STOPWORDS)
# stopwords.update(["corona", "trump", "2020",])
# wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate_from_frequencies(counts)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
from sklearn.linear_model import LinearRegression
section_articles_comments  # notebook-style display of the per-section totals
section | number_of_comments | number_of_articles | |
---|---|---|---|
36 | U.S. | 1156457 | 2364 |
18 | Opinion | 2082114 | 2272 |
40 | World | 284874 | 1183 |
1 | Arts | 60180 | 1094 |
16 | New York | 211581 | 1055 |
6 | Business Day | 196889 | 932 |
30 | The Learning Network | 113314 | 708 |
22 | Real Estate | 46987 | 687 |
39 | Well | 81669 | 630 |
11 | Food | 58564 | 573 |
25 | Sports | 28500 | 569 |
14 | Magazine | 109963 | 471 |
8 | Crosswords & Games | 111934 | 468 |
12 | Health | 78403 | 425 |
26 | Style | 43404 | 400 |
4 | Books | 20733 | 371 |
23 | Science | 23698 | 354 |
20 | Podcasts | 7309 | 353 |
31 | The Upshot | 109273 | 338 |
29 | Technology | 49165 | 254 |
33 | Theater | 5590 | 245 |
35 | Travel | 25199 | 213 |
15 | Movies | 20243 | 173 |
7 | Climate | 26613 | 148 |
34 | Times Insider | 8336 | 115 |
19 | Parenting | 3533 | 86 |
28 | T Magazine | 2303 | 83 |
21 | Reader Center | 4685 | 78 |
10 | Fashion & Style | 2477 | 51 |
17 | Obituaries | 2763 | 44 |
24 | Smarter Living | 682 | 7 |
27 | Sunday Review | 5469 | 7 |
9 | Education | 182 | 7 |
32 | The Weekly | 205 | 6 |
37 | Universal | 109 | 5 |
2 | At Home | 210 | 5 |
41 | Your Money | 322 | 4 |
38 | Video | 1812 | 3 |
5 | Briefing | 226 | 3 |
13 | International Home | 263 | 1 |
3 | Automobiles | 29 | 1 |
0 | Admin | 199 | 1 |
# Simple linear regression: predict total comments from article count.
X= section_articles_comments[['number_of_articles']]
y= section_articles_comments['number_of_comments']
model = LinearRegression()
model.fit(X, y)
# The next line is pasted notebook output (the repr printed by
# model.fit), not intentional code; it constructs and immediately
# discards an unused estimator.
LinearRegression()
# Rounded fitted values, attached for side-by-side comparison.
prediction = model.predict(X).round()
section_articles_comments['Prediction'] = prediction
section_articles_comments
section | number_of_comments | number_of_articles | Prediction | |
---|---|---|---|---|
36 | U.S. | 1156457 | 2364 | 1204325.0 |
18 | Opinion | 2082114 | 2272 | 1153480.0 |
40 | World | 284874 | 1183 | 551631.0 |
1 | Arts | 60180 | 1094 | 502444.0 |
16 | New York | 211581 | 1055 | 480890.0 |
6 | Business Day | 196889 | 932 | 412913.0 |
30 | The Learning Network | 113314 | 708 | 289116.0 |
22 | Real Estate | 46987 | 687 | 277510.0 |
39 | Well | 81669 | 630 | 246009.0 |
11 | Food | 58564 | 573 | 214507.0 |
25 | Sports | 28500 | 569 | 212296.0 |
14 | Magazine | 109963 | 471 | 158135.0 |
8 | Crosswords & Games | 111934 | 468 | 156477.0 |
12 | Health | 78403 | 425 | 132713.0 |
26 | Style | 43404 | 400 | 118896.0 |
4 | Books | 20733 | 371 | 102869.0 |
23 | Science | 23698 | 354 | 93474.0 |
20 | Podcasts | 7309 | 353 | 92921.0 |
31 | The Upshot | 109273 | 338 | 84631.0 |
29 | Technology | 49165 | 254 | 38208.0 |
33 | Theater | 5590 | 245 | 33234.0 |
35 | Travel | 25199 | 213 | 15548.0 |
15 | Movies | 20243 | 173 | -6558.0 |
7 | Climate | 26613 | 148 | -20375.0 |
34 | Times Insider | 8336 | 115 | -38612.0 |
19 | Parenting | 3533 | 86 | -54640.0 |
28 | T Magazine | 2303 | 83 | -56298.0 |
21 | Reader Center | 4685 | 78 | -59061.0 |
10 | Fashion & Style | 2477 | 51 | -73983.0 |
17 | Obituaries | 2763 | 44 | -77851.0 |
24 | Smarter Living | 682 | 7 | -98300.0 |
27 | Sunday Review | 5469 | 7 | -98300.0 |
9 | Education | 182 | 7 | -98300.0 |
32 | The Weekly | 205 | 6 | -98853.0 |
37 | Universal | 109 | 5 | -99405.0 |
2 | At Home | 210 | 5 | -99405.0 |
41 | Your Money | 322 | 4 | -99958.0 |
38 | Video | 1812 | 3 | -100511.0 |
5 | Briefing | 226 | 3 | -100511.0 |
13 | International Home | 263 | 1 | -101616.0 |
3 | Automobiles | 29 | 1 | -101616.0 |
0 | Admin | 199 | 1 | -101616.0 |
# Predicted comment total for a hypothetical section with 200 articles.
model.predict([[200]])
# NOTE(review): the next line is pasted notebook output, not code;
# `array` is undefined here, so it would raise NameError if executed.
array([8363.86966737])
# Coefficient of determination of the fit on the training data.
r_squared = model.score(X, y)
r_squared
0.6989108894456372
# Scatter of articles vs. comments with the fitted line overlaid.
plt.scatter(X,y)
plt.plot(X, prediction, 'r', linestyle='-.')
plt.tight_layout()
plt.title('Koreliacija tarp straipsnių skaičiaus ir komentarų skaičiaus')
# The Text(...) line below is pasted notebook output (plt.title's
# return value), not intentional code; it would raise NameError if run.
Text(0.5, 1.0, 'Koreliacija tarp straipsnių skaičiaus ir komentarų skaičiaus')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure(figsize=(13,8))
# Decorative character plot, same technique as the one above: one '*'
# marker per occurrence of each character, text printed along y=55.
searchstr = 'BET LENTELĖ GRAŽI, NESAKYKIT'
chars = list(set(searchstr))
chars.sort()
for char in chars:
    # 1-based x positions of every occurrence of `char`.
    indices = [i + 1 for i in range(len(searchstr)) if searchstr[i] == char]
    plt.plot(indices, [ord(char)] * len(indices), '*')
for index, ch in enumerate(searchstr):
    plt.text(index, 55, ch, family='monospace')
plt.xlim(0, len(searchstr))
plt.ylim(ord(' ') - 1, ord('Z') + 1)
# Minor vertical gridlines, one per character column.
# (The duplicated grid() call was removed; one call suffices.)
plt.gca().xaxis.grid(True, 'minor')
plt.gca().xaxis.set_minor_locator(ticker.MultipleLocator(1))
plt.show()