{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", " \n", "# Assumes JSON Lines format: one JSON object per line of the file\n", "df=pd.read_json(\"../data/news_category_dataset.json\", lines=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "authors object\n", "category object\n", "date datetime64[ns]\n", "headline object\n", "link object\n", "short_description object\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "124989" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows (data points)\n", "len(df)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
authorscategorydateheadlinelinkshort_description
2317Chris McGonigalWORLD NEWS2018-04-12Striking Photos Show Israelis Standing Still F...https://www.huffingtonpost.com/entry/israelis-...Traffic stopped for two minutes.
20810Bill Bradley and David MoyeENTERTAINMENT2017-07-23'Westworld' Season 2 Teaser Suggests Violent D...https://www.huffingtonpost.com/entry/westworld...This time, the robots are in charge?
56479Eliot NelsonPOLITICS2016-06-10Donald Trump Tells Religious Conservatives He'...https://www.huffingtonpost.com/entry/donald-tr...His somber address was a hit at the Freedom Co...
39517Jenna AmatulliARTS & CULTURE2016-12-19Sadly, 'Puppy' Isn't Merriam-Webster's Word Of...https://www.huffingtonpost.com/entry/sadly-pup...“Surreal\" won out over \"puppy,” “flummadiddle,...
18634Robert Koehler, ContributorPeace journalistWORLD NEWS2017-08-17Why Does North Korea Hate Us?https://www.huffingtonpost.com/entry/why-does-...“The bombing was long, leisurely and merciless...
18900East-West Center, ContributorPromoting better ...WOMEN2017-08-15Asian Teams Participate In FIRST Global Roboti...https://www.huffingtonpost.com/entry/asian-tea...By Xinxin Zhang, Research Intern, East-West Ce...
88345Sebastian MurdockWEIRD NEWS2015-06-14This Is Not How You Play Frisbee, But We Love ...https://www.huffingtonpost.com/entry/bosnian-f...
62054James CaveSTYLE2016-04-07The Real Story On How Trench Coats Got Their Namehttps://www.huffingtonpost.com/entry/why-are-t...Bet you never knew how Burberry struck gold.
66784SPORTS2016-02-12New York Mets Pitcher Jenrry Mejia Permanently...https://www.huffingtonpost.comhttp://pubx.co/h...New York Mets relief pitcher Jenrry Mejia has ...
73038SPORTS2015-12-03New Wave of Arrests in FIFA Corruption Scandalhttps://www.huffingtonpost.comhttp://www.nytim...At least some of the arrests took place at the...
52487Michael McLaughlinPOLITICS2016-07-25California City Bans Deceptive Ads By Anti-Abo...https://www.huffingtonpost.com/entry/oakland-b...Oakland's law targets pregnancy clinics that o...
111907Bill BradleyENTERTAINMENT2014-09-15Daryl Dixon's Big Secret Is Finally Revealedhttps://www.huffingtonpost.com/entry/walking-d...
64576Zach Carter and Shahien NasiripourPOLITICS2016-03-08Elizabeth Warren Is Not On The Ballot And Her ...https://www.huffingtonpost.com/entry/elizabeth...The progressive movement faces its biggest test.
39143Nick VisserHEALTHY LIVING2016-12-23Scientists Create Effective Ebola Vaccine, Jus...https://www.huffingtonpost.com/entry/ebola-vac...\"When the next Ebola outbreak hits, we will no...
87400Beth Weissenberger, ContributorHEALTHY LIVING2015-06-25The Soul's Ingredients: The Secret to Summonin...https://www.huffingtonpost.com/entry/the-souls...
32935Rebecca ShapiroCOMEDY2017-03-02Seth Meyers Chews Out The Media For Gushing Ov...https://www.huffingtonpost.com/entry/seth-meye...\"Guys, seriously, do you have amnesia?\"
48774Lee MoranCOMEDY2016-09-05John Oliver Lists The Habits We Should Actuall...https://www.huffingtonpost.com/entry/john-oliv...Temporarily not wearing white isn't enough for...
17443Dave JamiesonPOLITICS2017-09-01Labor Lawyers Blast Trump Administration For P...https://www.huffingtonpost.com/entry/hundreds-...\"To place him next to the brave men and women ...
114621Andy CampbellWEIRD NEWS2014-08-16Wrong-Way Driver Hits Cyclists, Hides Meth In ...https://www.huffingtonpost.com/entry/wrongway-...
87370Michael Ernest Sweet, ContributorCanadian Writ...QUEER VOICES2015-06-25The Last Boys: A Book Of Photographs By Barry ...https://www.huffingtonpost.com/entry/the-last-...
119406Sara Eckel, ContributorAuthor, It’s Not You: 2...WOMEN2014-06-23Are You Hot or Not? The Answer May Change With...https://www.huffingtonpost.com/entry/are-you-h...We all venture into the dating world hoping th...
117313Lisa Copeland, ContributorDating Coach For Wom...FIFTY2014-07-16Top 20 Dating Tips For Finding Love Again Afte...https://www.huffingtonpost.com/entry/dating-ov...1. Put in writing what type of relationship yo...
95992Janell Burley Hofmann, ContributorAuthor, Spea...PARENTS2015-03-185 Tiny Stories From SxSWhttps://www.huffingtonpost.com/entry/5-tiny-st...You told the panel that they changed your life...
3400Mary PapenfussENTERTAINMENT2018-03-23Bill Murray Compares Parkland Teens To Vietnam...https://www.huffingtonpost.com/entry/parkland-...When your idealism isn't \"broken yet,\" you spe...
96289Ron DickerSCIENCE2015-03-14You Might As Well Flip A Coin To Fill In Your ...https://www.huffingtonpost.com/entry/flip-coin...
91428China Hands, ContributorFor Future Leaders in ...POLITICS2015-05-10Beyond the Gaokao: How Chinese Students Earn T...https://www.huffingtonpost.com/entry/beyond-th...Qi recounts three stories of students making t...
29215Mary PapenfussSPORTS2017-04-15NFL's Todd Heap Runs Over Daughter In Deadly A...https://www.huffingtonpost.com/entry/todd-heap...The tragedy occurred in driveway of his Arizon...
49611Michael McLaughlinPOLITICS2016-08-26Trump And Clinton Supporters Find Common Groun...https://www.huffingtonpost.com/entry/trump-and...But on other gun policy questions, the gaps ke...
53475Leigh BlickleyENTERTAINMENT2016-07-14'Game Of Thrones' Stars React To Their Emmy No...https://www.huffingtonpost.com/entry/game-of-t...So many (23, to be exact) noms for this amazin...
79857James Michael NicholsQUEER VOICES2015-09-18Seasonal Queer Coming-Of-Consciousness Party '...https://www.huffingtonpost.com/entry/psychic-s...\"We don’t impose rules on people -- the vibe i...
.....................
72907CRIME2015-12-04California Rampage Shocks Those Who Knew Shootershttps://www.huffingtonpost.com/entry/san-berna...\"This was a person who was successful, who had...
102621Nick VisserGREEN2014-12-31The 10 Best Photos From The Dept. Of The Inter...https://www.huffingtonpost.com/entry/departmen...
108212Cynthia Dagnal-Myron, ContributorReporter, Aut...ENTERTAINMENT2014-10-28Jack Bruce: A Fond Fan Farewellhttps://www.huffingtonpost.com/entry/jack-bruc...Clapton was God to some--or so the graffiti sa...
94707Kevin Price, ContributorPublisher and Editor i...POLITICS2015-04-02Federal Laws: Too Numerous and Vaguehttps://www.huffingtonpost.com/entry/federal-l...The vast majority of Americans clearly seem un...
54325Sarah Guerrero, ContributorWriter & Stay at Ho...WOMEN2016-07-04I Had An Early Miscarriagehttps://www.huffingtonpost.com/entry/i-had-an-...Grief sneaks out during tired moments.
21837Cavan SieczkowskiENTERTAINMENT2017-07-11Ryan Reynolds Gave The Best Nod To The Glory O...https://www.huffingtonpost.com/entry/ryan-reyn...Bow down.
102087Mark DeCarlo, Contributor3 time Emmy Award win...HEALTHY LIVING2015-01-06Michael Symon: Cleveland's Real Kinghttps://www.huffingtonpost.com/entry/michael-s...
101670RELIGION2015-01-11Police Chief To Black Churches: 'We Can't Do T...https://www.huffingtonpost.com/entry/police-bl...
44538Nina GolgowskiLATINO VOICES2016-10-23Eric Trump Poses With Woman Wearing 'Latina Ag...https://www.huffingtonpost.com/entry/eric-trum...\"It stands as a testament to the lack of diver...
50770Cole DelbyckENTERTAINMENT2016-08-13Kenny Baker, The Actor Who Played R2-D2 In 'St...https://www.huffingtonpost.com/entry/kenny-bak...Sad beep.
50828Cristian FariasPOLITICS2016-08-12Federal Judges Can't Clear Someone's Record, E...https://www.huffingtonpost.com/entry/federal-j...A court said they are without authority to wip...
109132Ayala Laufer-Cahana, M.D., ContributorPhysicia...HEALTHY LIVING2014-10-17Brain Zapping for Weight Losshttps://www.huffingtonpost.com/entry/brain-zap...
65178Daniel MaransPOLITICS2016-03-02Mainstream Republicans Are Unsure How To Stop ...https://www.huffingtonpost.com/entry/republica...Others are resigned to his victory -- and are ...
17870Emma GrayWOMEN2017-08-28Trump Confuses Two Female Finnish Journalists ...https://www.huffingtonpost.com/entry/trump-con...The Finnish president had to explain that they...
50296Natalie Jackson and Ariel Edwards-LevyPOLITICS2016-08-18HUFFPOLLSTER: Without Donald Trump, Republican...https://www.huffingtonpost.com/entry/without-d...The Republican nominee should be polling bette...
2924Andy McDonaldENTERTAINMENT2018-04-02'Broad City' Co-Creator Developing 'League Of ...https://www.huffingtonpost.com/entry/league-of...The project reportedly received the blessing o...
80415Vishavjit Singh, ContributorEditorial cartoonistRELIGION2015-09-12The Sikh Boy Who Developed Breasts And Grew Up...https://www.huffingtonpost.com/entry/the-sikh-...In my early childhood photos I look like a hea...
16535Michael Tesler, ContributorAssociate Professor...POLITICS2017-09-14Jemele Hill’s The Mainstream: Most Americans T...https://www.huffingtonpost.com/entry/jemele-hi...Jemele Hill, the ESPN host of SportsCenter’s “...
93889Ron DickerLATINO VOICES2015-04-11Zoe Saldana Opens Up On Facebook About Post-Bi...https://www.huffingtonpost.com/entry/zoe-salda...
119723Eric J. Hall, ContributorPresident & CEO of He...HEALTHY LIVING2014-06-19Many Doctors Don't Support Life Support When ...https://www.huffingtonpost.com/entry/death-and...A recent study of more than a thousand doctors...
3827Curtis M. WongQUEER VOICES2018-03-16Newspaper Scraps References To Gay Man's Husba...https://www.huffingtonpost.com/entry/texas-new...The publisher of Texas' Olton Enterprise said ...
14693Alexander C. KaufmanPOLITICS2017-10-07Pittsburgh’s Mayor Calls For ‘An American Mars...https://www.huffingtonpost.com/entry/peduto-tr...This could be the key for Democrats to win bac...
58342Maddie CrumARTS & CULTURE2016-05-19A Visual Survey Of Retro Computers That Predat...https://www.huffingtonpost.com/entry/retro-com...Elegant technology has a long history.
124933David Finkle, ContributorWriter, Drama CriticARTS2014-04-18First Nighter: Moss Hart's \"Act One\" in Two Gr...https://www.huffingtonpost.com/entry/first-nig...
19126Sebastian MurdockCRIME2017-08-12'Beautiful Moment Ripped Away' As Car Plows In...https://www.huffingtonpost.com/entry/during-ra...\"These terrorists aren't trolls -- they're ter...
57926GOOD NEWS2016-05-24Paralyzed Dog Was About To Be Put Down When So...https://www.huffingtonpost.comhttp://pubx.co/o...Ollie, a 10-year-old Shetland sheepdog (aka Sh...
86604Dragos Bratasanu, ContributorMake your dream a...HEALTHY LIVING2015-07-03How To Create Giant Success (And Live A Fulfil...https://www.huffingtonpost.com/entry/7-steps-t...There is a series of mental events that lead t...
36765Ron DickerENTERTAINMENT2017-01-19Woman Waving Palestinian Flag Accosts Bella Ha...https://www.huffingtonpost.com/entry/woman-wav...Police say a harassment report has been filed.
56098Minou Clark, The Huffington PostCOMEDY2016-06-1423 Things You'll Only Understand If You Still ...https://www.huffingtonpost.com/entry/yup-still...There's no place like home, right?
52090Kate SheppardPOLITICS2016-07-29Bill And Tim (And Hillary And Barack's) Excell...https://www.huffingtonpost.com/entry/photos-ba...Behind the scenes at the Democratic National C...
\n", "

100 rows × 6 columns

\n", "
" ], "text/plain": [ " authors category \\\n", "2317 Chris McGonigal WORLD NEWS \n", "20810 Bill Bradley and David Moye ENTERTAINMENT \n", "56479 Eliot Nelson POLITICS \n", "39517 Jenna Amatulli ARTS & CULTURE \n", "18634 Robert Koehler, ContributorPeace journalist WORLD NEWS \n", "18900 East-West Center, ContributorPromoting better ... WOMEN \n", "88345 Sebastian Murdock WEIRD NEWS \n", "62054 James Cave STYLE \n", "66784 SPORTS \n", "73038 SPORTS \n", "52487 Michael McLaughlin POLITICS \n", "111907 Bill Bradley ENTERTAINMENT \n", "64576 Zach Carter and Shahien Nasiripour POLITICS \n", "39143 Nick Visser HEALTHY LIVING \n", "87400 Beth Weissenberger, Contributor HEALTHY LIVING \n", "32935 Rebecca Shapiro COMEDY \n", "48774 Lee Moran COMEDY \n", "17443 Dave Jamieson POLITICS \n", "114621 Andy Campbell WEIRD NEWS \n", "87370 Michael Ernest Sweet, ContributorCanadian Writ... QUEER VOICES \n", "119406 Sara Eckel, ContributorAuthor, It’s Not You: 2... WOMEN \n", "117313 Lisa Copeland, ContributorDating Coach For Wom... FIFTY \n", "95992 Janell Burley Hofmann, ContributorAuthor, Spea... PARENTS \n", "3400 Mary Papenfuss ENTERTAINMENT \n", "96289 Ron Dicker SCIENCE \n", "91428 China Hands, ContributorFor Future Leaders in ... POLITICS \n", "29215 Mary Papenfuss SPORTS \n", "49611 Michael McLaughlin POLITICS \n", "53475 Leigh Blickley ENTERTAINMENT \n", "79857 James Michael Nichols QUEER VOICES \n", "... ... ... \n", "72907 CRIME \n", "102621 Nick Visser GREEN \n", "108212 Cynthia Dagnal-Myron, ContributorReporter, Aut... ENTERTAINMENT \n", "94707 Kevin Price, ContributorPublisher and Editor i... POLITICS \n", "54325 Sarah Guerrero, ContributorWriter & Stay at Ho... WOMEN \n", "21837 Cavan Sieczkowski ENTERTAINMENT \n", "102087 Mark DeCarlo, Contributor3 time Emmy Award win... 
HEALTHY LIVING \n", "101670 RELIGION \n", "44538 Nina Golgowski LATINO VOICES \n", "50770 Cole Delbyck ENTERTAINMENT \n", "50828 Cristian Farias POLITICS \n", "109132 Ayala Laufer-Cahana, M.D., ContributorPhysicia... HEALTHY LIVING \n", "65178 Daniel Marans POLITICS \n", "17870 Emma Gray WOMEN \n", "50296 Natalie Jackson and Ariel Edwards-Levy POLITICS \n", "2924 Andy McDonald ENTERTAINMENT \n", "80415 Vishavjit Singh, ContributorEditorial cartoonist RELIGION \n", "16535 Michael Tesler, ContributorAssociate Professor... POLITICS \n", "93889 Ron Dicker LATINO VOICES \n", "119723 Eric J. Hall, ContributorPresident & CEO of He... HEALTHY LIVING \n", "3827 Curtis M. Wong QUEER VOICES \n", "14693 Alexander C. Kaufman POLITICS \n", "58342 Maddie Crum ARTS & CULTURE \n", "124933 David Finkle, ContributorWriter, Drama Critic ARTS \n", "19126 Sebastian Murdock CRIME \n", "57926 GOOD NEWS \n", "86604 Dragos Bratasanu, ContributorMake your dream a... HEALTHY LIVING \n", "36765 Ron Dicker ENTERTAINMENT \n", "56098 Minou Clark, The Huffington Post COMEDY \n", "52090 Kate Sheppard POLITICS \n", "\n", " date headline \\\n", "2317 2018-04-12 Striking Photos Show Israelis Standing Still F... \n", "20810 2017-07-23 'Westworld' Season 2 Teaser Suggests Violent D... \n", "56479 2016-06-10 Donald Trump Tells Religious Conservatives He'... \n", "39517 2016-12-19 Sadly, 'Puppy' Isn't Merriam-Webster's Word Of... \n", "18634 2017-08-17 Why Does North Korea Hate Us? \n", "18900 2017-08-15 Asian Teams Participate In FIRST Global Roboti... \n", "88345 2015-06-14 This Is Not How You Play Frisbee, But We Love ... \n", "62054 2016-04-07 The Real Story On How Trench Coats Got Their Name \n", "66784 2016-02-12 New York Mets Pitcher Jenrry Mejia Permanently... \n", "73038 2015-12-03 New Wave of Arrests in FIFA Corruption Scandal \n", "52487 2016-07-25 California City Bans Deceptive Ads By Anti-Abo... 
\n", "111907 2014-09-15 Daryl Dixon's Big Secret Is Finally Revealed \n", "64576 2016-03-08 Elizabeth Warren Is Not On The Ballot And Her ... \n", "39143 2016-12-23 Scientists Create Effective Ebola Vaccine, Jus... \n", "87400 2015-06-25 The Soul's Ingredients: The Secret to Summonin... \n", "32935 2017-03-02 Seth Meyers Chews Out The Media For Gushing Ov... \n", "48774 2016-09-05 John Oliver Lists The Habits We Should Actuall... \n", "17443 2017-09-01 Labor Lawyers Blast Trump Administration For P... \n", "114621 2014-08-16 Wrong-Way Driver Hits Cyclists, Hides Meth In ... \n", "87370 2015-06-25 The Last Boys: A Book Of Photographs By Barry ... \n", "119406 2014-06-23 Are You Hot or Not? The Answer May Change With... \n", "117313 2014-07-16 Top 20 Dating Tips For Finding Love Again Afte... \n", "95992 2015-03-18 5 Tiny Stories From SxSW \n", "3400 2018-03-23 Bill Murray Compares Parkland Teens To Vietnam... \n", "96289 2015-03-14 You Might As Well Flip A Coin To Fill In Your ... \n", "91428 2015-05-10 Beyond the Gaokao: How Chinese Students Earn T... \n", "29215 2017-04-15 NFL's Todd Heap Runs Over Daughter In Deadly A... \n", "49611 2016-08-26 Trump And Clinton Supporters Find Common Groun... \n", "53475 2016-07-14 'Game Of Thrones' Stars React To Their Emmy No... \n", "79857 2015-09-18 Seasonal Queer Coming-Of-Consciousness Party '... \n", "... ... ... \n", "72907 2015-12-04 California Rampage Shocks Those Who Knew Shooters \n", "102621 2014-12-31 The 10 Best Photos From The Dept. Of The Inter... \n", "108212 2014-10-28 Jack Bruce: A Fond Fan Farewell \n", "94707 2015-04-02 Federal Laws: Too Numerous and Vague \n", "54325 2016-07-04 I Had An Early Miscarriage \n", "21837 2017-07-11 Ryan Reynolds Gave The Best Nod To The Glory O... \n", "102087 2015-01-06 Michael Symon: Cleveland's Real King \n", "101670 2015-01-11 Police Chief To Black Churches: 'We Can't Do T... \n", "44538 2016-10-23 Eric Trump Poses With Woman Wearing 'Latina Ag... 
\n", "50770 2016-08-13 Kenny Baker, The Actor Who Played R2-D2 In 'St... \n", "50828 2016-08-12 Federal Judges Can't Clear Someone's Record, E... \n", "109132 2014-10-17 Brain Zapping for Weight Loss \n", "65178 2016-03-02 Mainstream Republicans Are Unsure How To Stop ... \n", "17870 2017-08-28 Trump Confuses Two Female Finnish Journalists ... \n", "50296 2016-08-18 HUFFPOLLSTER: Without Donald Trump, Republican... \n", "2924 2018-04-02 'Broad City' Co-Creator Developing 'League Of ... \n", "80415 2015-09-12 The Sikh Boy Who Developed Breasts And Grew Up... \n", "16535 2017-09-14 Jemele Hill’s The Mainstream: Most Americans T... \n", "93889 2015-04-11 Zoe Saldana Opens Up On Facebook About Post-Bi... \n", "119723 2014-06-19 Many Doctors Don't Support Life Support When ... \n", "3827 2018-03-16 Newspaper Scraps References To Gay Man's Husba... \n", "14693 2017-10-07 Pittsburgh’s Mayor Calls For ‘An American Mars... \n", "58342 2016-05-19 A Visual Survey Of Retro Computers That Predat... \n", "124933 2014-04-18 First Nighter: Moss Hart's \"Act One\" in Two Gr... \n", "19126 2017-08-12 'Beautiful Moment Ripped Away' As Car Plows In... \n", "57926 2016-05-24 Paralyzed Dog Was About To Be Put Down When So... \n", "86604 2015-07-03 How To Create Giant Success (And Live A Fulfil... \n", "36765 2017-01-19 Woman Waving Palestinian Flag Accosts Bella Ha... \n", "56098 2016-06-14 23 Things You'll Only Understand If You Still ... \n", "52090 2016-07-29 Bill And Tim (And Hillary And Barack's) Excell... \n", "\n", " link \\\n", "2317 https://www.huffingtonpost.com/entry/israelis-... \n", "20810 https://www.huffingtonpost.com/entry/westworld... \n", "56479 https://www.huffingtonpost.com/entry/donald-tr... \n", "39517 https://www.huffingtonpost.com/entry/sadly-pup... \n", "18634 https://www.huffingtonpost.com/entry/why-does-... \n", "18900 https://www.huffingtonpost.com/entry/asian-tea... \n", "88345 https://www.huffingtonpost.com/entry/bosnian-f... 
\n", "62054 https://www.huffingtonpost.com/entry/why-are-t... \n", "66784 https://www.huffingtonpost.comhttp://pubx.co/h... \n", "73038 https://www.huffingtonpost.comhttp://www.nytim... \n", "52487 https://www.huffingtonpost.com/entry/oakland-b... \n", "111907 https://www.huffingtonpost.com/entry/walking-d... \n", "64576 https://www.huffingtonpost.com/entry/elizabeth... \n", "39143 https://www.huffingtonpost.com/entry/ebola-vac... \n", "87400 https://www.huffingtonpost.com/entry/the-souls... \n", "32935 https://www.huffingtonpost.com/entry/seth-meye... \n", "48774 https://www.huffingtonpost.com/entry/john-oliv... \n", "17443 https://www.huffingtonpost.com/entry/hundreds-... \n", "114621 https://www.huffingtonpost.com/entry/wrongway-... \n", "87370 https://www.huffingtonpost.com/entry/the-last-... \n", "119406 https://www.huffingtonpost.com/entry/are-you-h... \n", "117313 https://www.huffingtonpost.com/entry/dating-ov... \n", "95992 https://www.huffingtonpost.com/entry/5-tiny-st... \n", "3400 https://www.huffingtonpost.com/entry/parkland-... \n", "96289 https://www.huffingtonpost.com/entry/flip-coin... \n", "91428 https://www.huffingtonpost.com/entry/beyond-th... \n", "29215 https://www.huffingtonpost.com/entry/todd-heap... \n", "49611 https://www.huffingtonpost.com/entry/trump-and... \n", "53475 https://www.huffingtonpost.com/entry/game-of-t... \n", "79857 https://www.huffingtonpost.com/entry/psychic-s... \n", "... ... \n", "72907 https://www.huffingtonpost.com/entry/san-berna... \n", "102621 https://www.huffingtonpost.com/entry/departmen... \n", "108212 https://www.huffingtonpost.com/entry/jack-bruc... \n", "94707 https://www.huffingtonpost.com/entry/federal-l... \n", "54325 https://www.huffingtonpost.com/entry/i-had-an-... \n", "21837 https://www.huffingtonpost.com/entry/ryan-reyn... \n", "102087 https://www.huffingtonpost.com/entry/michael-s... \n", "101670 https://www.huffingtonpost.com/entry/police-bl... 
\n", "44538 https://www.huffingtonpost.com/entry/eric-trum... \n", "50770 https://www.huffingtonpost.com/entry/kenny-bak... \n", "50828 https://www.huffingtonpost.com/entry/federal-j... \n", "109132 https://www.huffingtonpost.com/entry/brain-zap... \n", "65178 https://www.huffingtonpost.com/entry/republica... \n", "17870 https://www.huffingtonpost.com/entry/trump-con... \n", "50296 https://www.huffingtonpost.com/entry/without-d... \n", "2924 https://www.huffingtonpost.com/entry/league-of... \n", "80415 https://www.huffingtonpost.com/entry/the-sikh-... \n", "16535 https://www.huffingtonpost.com/entry/jemele-hi... \n", "93889 https://www.huffingtonpost.com/entry/zoe-salda... \n", "119723 https://www.huffingtonpost.com/entry/death-and... \n", "3827 https://www.huffingtonpost.com/entry/texas-new... \n", "14693 https://www.huffingtonpost.com/entry/peduto-tr... \n", "58342 https://www.huffingtonpost.com/entry/retro-com... \n", "124933 https://www.huffingtonpost.com/entry/first-nig... \n", "19126 https://www.huffingtonpost.com/entry/during-ra... \n", "57926 https://www.huffingtonpost.comhttp://pubx.co/o... \n", "86604 https://www.huffingtonpost.com/entry/7-steps-t... \n", "36765 https://www.huffingtonpost.com/entry/woman-wav... \n", "56098 https://www.huffingtonpost.com/entry/yup-still... \n", "52090 https://www.huffingtonpost.com/entry/photos-ba... \n", "\n", " short_description \n", "2317 Traffic stopped for two minutes. \n", "20810 This time, the robots are in charge? \n", "56479 His somber address was a hit at the Freedom Co... \n", "39517 “Surreal\" won out over \"puppy,” “flummadiddle,... \n", "18634 “The bombing was long, leisurely and merciless... \n", "18900 By Xinxin Zhang, Research Intern, East-West Ce... \n", "88345 \n", "62054 Bet you never knew how Burberry struck gold. \n", "66784 New York Mets relief pitcher Jenrry Mejia has ... \n", "73038 At least some of the arrests took place at the... \n", "52487 Oakland's law targets pregnancy clinics that o... 
\n", "111907 \n", "64576 The progressive movement faces its biggest test. \n", "39143 \"When the next Ebola outbreak hits, we will no... \n", "87400 \n", "32935 \"Guys, seriously, do you have amnesia?\" \n", "48774 Temporarily not wearing white isn't enough for... \n", "17443 \"To place him next to the brave men and women ... \n", "114621 \n", "87370 \n", "119406 We all venture into the dating world hoping th... \n", "117313 1. Put in writing what type of relationship yo... \n", "95992 You told the panel that they changed your life... \n", "3400 When your idealism isn't \"broken yet,\" you spe... \n", "96289 \n", "91428 Qi recounts three stories of students making t... \n", "29215 The tragedy occurred in driveway of his Arizon... \n", "49611 But on other gun policy questions, the gaps ke... \n", "53475 So many (23, to be exact) noms for this amazin... \n", "79857 \"We don’t impose rules on people -- the vibe i... \n", "... ... \n", "72907 \"This was a person who was successful, who had... \n", "102621 \n", "108212 Clapton was God to some--or so the graffiti sa... \n", "94707 The vast majority of Americans clearly seem un... \n", "54325 Grief sneaks out during tired moments. \n", "21837 Bow down. \n", "102087 \n", "101670 \n", "44538 \"It stands as a testament to the lack of diver... \n", "50770 Sad beep. \n", "50828 A court said they are without authority to wip... \n", "109132 \n", "65178 Others are resigned to his victory -- and are ... \n", "17870 The Finnish president had to explain that they... \n", "50296 The Republican nominee should be polling bette... \n", "2924 The project reportedly received the blessing o... \n", "80415 In my early childhood photos I look like a hea... \n", "16535 Jemele Hill, the ESPN host of SportsCenter’s “... \n", "93889 \n", "119723 A recent study of more than a thousand doctors... \n", "3827 The publisher of Texas' Olton Enterprise said ... \n", "14693 This could be the key for Democrats to win bac... 
\n", "58342 Elegant technology has a long history. \n", "124933 \n", "19126 \"These terrorists aren't trolls -- they're ter... \n", "57926 Ollie, a 10-year-old Shetland sheepdog (aka Sh... \n", "86604 There is a series of mental events that lead t... \n", "36765 Police say a harassment report has been filed. \n", "56098 There's no place like home, right? \n", "52090 Behind the scenes at the Democratic National C... \n", "\n", "[100 rows x 6 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sample(100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Date range\n", "Articles span roughly four years, from spring 2014 to mid-2018; check `df.date.min()` and `df.date.max()` for the exact bounds." ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.date.hist(figsize=(12,6),color='#86bf91',)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Category Distribution\n", "\n", "### Number of categories" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "31" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(set(df['category'].values))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Category by count\n", "\n", "Most of the articles are related to politics. Education-related articles have the lowest volume." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAFICAYAAAC7oDIfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJztnXn8bVP9/58vQ6bMLsnQFUpKyE2GBvHNlKJBXJVbKSoKpaKv0qQ0S0k/RShDUqIS+aJkdmW4LsrNEBpcsybC+/fHe+37WWfvfc7Z+3xmn/fz8diPs/faa6+99jn7rPda7/V+v5fMjCAIgiDIWWi8KxAEQRBMPEI4BEEQBBVCOARBEAQVQjgEQRAEFUI4BEEQBBVCOARBEAQVQjgEQRAEFUI4BEEQBBVCOARBEAQVFhnvCgzKSiutZNOnTx/vagRBEEwqrrnmmvvMbFq/fJNWOEyfPp3Zs2ePdzWCIAgmFZLubJIv1EpBEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYVJ6wRXMP3gX9am33HEa8e4JkEQBE8fYuQQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVOgrHCQtLukqSddLmivp0yl9LUlXSpon6UeSnpHSF0vH89L56VlZh6T0P0jaLkvfPqXNk3TwyD9mEARB0IYmI4fHgK3NbENgI2B7SZsBXwS+bmbrAA8Ce6X8ewEPpvSvp3xIWh/YHXghsD3wbUkLS1oYOBrYAVgfmJnyBkEQBONEX+Fgzj/S4aJpM2Br4IyUfiKwS9rfOR2Tzm8jSSn9NDN7zMxuB+YBm6ZtnpndZmaPA6elvEEQBME40WjOIfXwrwPuBc4H/gQ8ZGZPpCx3A6ul/dWAuwDS+YeBFfP00jXd0uvqsbek2ZJmz58/v0nVgyAIggFoJBzM7Ekz2whYHe/przeqtepej2PNbIaZzZg2bdp4VCEIgmBK0MpaycweAi4CNgeWk1QsM7o6cE/avwdYAyCdXxa4P08vXdMtPQiCIBgnmlgrTZO0XNpfAngNcDMuJN6css0Czkr7Z6dj0vkLzcxS+u7JmmktYF3gKuBqYN1k/fQMfNL67JF4uCAIgmAwFumfhVWBE5NV0ULA6Wb2C0k3AadJ+hxwLXBcyn8c8ANJ84AH8MYeM5sr6XTgJuAJYF8zexJA0n7AecDCwPFmNnfEnjAIgiBoTV/hYGY3ABvXpN+Gzz+U0/8D7NqlrMOBw2vSzwHOaVDfIAiCYAwID+kgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEIIhyAIgqBCCIcgCIKgQgiHIAiCoEJf4SBpDUkXSbpJ0lxJ+6f0T0m6R9J1adsxu+YQSfMk/UHSdln69iltnqSDs/S1JF2Z0n8k6Rkj/aBBEARBc5qMHJ4APmxm6wObAftKWj+d+7qZbZS2cwDSud2BFwLbA9+WtLCkhYGjgR2A9YGZWTlfTGWtAzwI7DVCzxcEQRAMQF/hYGZ/NbPfp/1HgZuB1XpcsjNwmpk9Zma3A/OATdM2z8xuM7PHgdOAnSUJ2Bo4I11/IrDLoA8UBEEQDJ9Wcw6SpgMbA1empP0k3S
DpeEnLp7TVgLuyy+5Oad3SVwQeMrMnSul1999b0mxJs+fPn9+m6kEQBEELGgsHSc8EfgIcYGaPAMcAawMbAX8FvjoqNcwws2PNbIaZzZg2bdpo3y4IgmDKskiTTJIWxQXDyWb2UwAz+3t2/rvAL9LhPcAa2eWrpzS6pN8PLCdpkTR6yPMHQRAE40ATayUBxwE3m9nXsvRVs2xvAG5M+2cDu0taTNJawLrAVcDVwLrJMukZ+KT12WZmwEXAm9P1s4CzhvdYQRAEwXBoMnLYEng7MEfSdSnt47i10UaAAXcA+wCY2VxJpwM34ZZO+5rZkwCS9gPOAxYGjjezuam8jwGnSfoccC0ujIIgCIJxoq9wMLNLANWcOqfHNYcDh9ekn1N3nZndhlszBUEQBBOA8JAOgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCqEcAiCIAgqhHAIgiAIKoRwCIIgCCr0FQ6S1pB0kaSbJM2VtH9KX0HS+ZJuTZ/Lp3RJOkrSPEk3SHpJVtaslP9WSbOy9E0kzUnXHCVJo/GwQRAEQTOajByeAD5sZusDmwH7SlofOBi4wMzWBS5IxwA7AOumbW/gGHBhAhwGvAzYFDisECgpz3uy67Yf/qMFQRAEg9JXOJjZX83s92n/UeBmYDVgZ+DElO1EYJe0vzNwkjlXAMtJWhXYDjjfzB4wsweB84Ht07llzOwKMzPgpKysIAiCYBxoNecgaTqwMXAlsIqZ/TWd+huwStpfDbgru+zulNYr/e6a9Lr77y1ptqTZ8+fPb1P1IAiCoAWNhYOkZwI/AQ4ws0fyc6nHbyNctwpmdqyZzTCzGdOmTRvt2wVBEExZGgkHSYviguFkM/tpSv57UgmRPu9N6fcAa2SXr57SeqWvXpMeBEEQjBNNrJUEHAfcbGZfy06dDRQWR7OAs7L0PZPV0mbAw0n9dB6wraTl00T0tsB56dwjkjZL99ozKysIgiAYBxZpkGdL4O3AHEnXpbSPA0cAp0vaC7gTeEs6dw6wIzAP+BfwTgAze0DSZ4GrU77PmNkDaf/9wAnAEsCv0hYEQRCME32Fg5ldAnTzO9imJr8B+3Yp63jg+Jr02cCL+tUlCIIgGBvCQzoIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCo0Fc4SDpe0r2SbszSPiXpHknXpW3H7NwhkuZJ+oOk7bL07VPaPEkHZ+lrSboypf9I0jNG8gGDIAiC9jQZOZwAbF+T/nUz2yht5wBIWh/YHXhhuubbkhaWtDBwNLADsD4wM+UF+GIqax3gQWCv4TxQEARBMHz6Cgczuxh4oGF5OwOnmdljZnY7MA/YNG3zzOw2M3scOA3YWZKArYEz0vUnAru0fIYgCIJghBnOnMN+km5IaqflU9pqwF1ZnrtTWrf0FYGHzOyJUnotkvaWNFvS7Pnz5w+j6kEQBEEvBhUOxwBrAxsBfwW+OmI16oGZHWtmM8xsxrRp08bilkEQBFOSRQa5yMz+XuxL+i7wi3R4D7BGlnX1lEaX9PuB5SQtkkYPef4gCIJgnBho5CBp1ezwDUBhyXQ2sLukxSStBawLXAVcDaybLJOegU9an21mBlwEvDldPws4a5A6BUEQBCNH35GDpFOBrYCVJN0NHAZsJWkjwIA7gH0AzGyupNOBm4AngH3N7MlUzn7AecDCwP
FmNjfd4mPAaZI+B1wLHDdiTxcEQRAMRF/hYGYza5K7NuBmdjhweE36OcA5Nem34dZMo8+nlu2S/vCY3D4IgmCyEB7SQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFEA5BEARBhRAOQRAEQYUQDkEQBEGFvsJB0vGS7pV0Y5a2gqTzJd2aPpdP6ZJ0lKR5km6Q9JLsmlkp/62SZmXpm0iak645SpJG+iGDIAiCdjQZOZwAbF9KOxi4wMzWBS5IxwA7AOumbW/gGHBhAhwGvAzYFDisECgpz3uy68r3CoIgCMaYvsLBzC4GHigl7wycmPZPBHbJ0k8y5wpgOUmrAtsB55vZA2b2IHA+sH06t4yZXWFmBpyUlRUEQRCME4POOaxiZn9N+38DVkn7qwF3ZfnuTmm90u+uSa9F0t6SZkuaPX/+/AGrHgRBEPRj2BPSqcdvI1CXJvc61sxmmNmMadOmjcUtgyAIpiSDCoe/J5UQ6fPelH4PsEaWb/WU1it99Zr0IAiCYBwZVDicDRQWR7OAs7L0PZPV0mbAw0n9dB6wraTl00T0tsB56dwjkjZLVkp7ZmUFQRAE48Qi/TJIOhXYClhJ0t241dERwOmS9gLuBN6Ssp8D7AjMA/4FvBPAzB6Q9Fng6pTvM2ZWTHK/H7eIWgL4VdqCIAiCcaSvcDCzmV1ObVOT14B9u5RzPHB8Tfps4EX96hEEQRCMHeEhHQRBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVAjhEARBEFQI4RAEQRBUCOEQBEEQVOi7nsNUZYMTN6ikzZk1ZxxqEgRBMPbEyCEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoEMIhCIIgqBDCIQiCIKgQwiEIgiCoMCzhIOkOSXMkXSdpdkpbQdL5km5Nn8undEk6StI8STdIeklWzqyU/1ZJs4b3SEEQBMFwGYmRw6vNbCMzm5GODwYuMLN1gQvSMcAOwLpp2xs4BlyYAIcBLwM2BQ4rBEoQBEEwPoyGWmln4MS0fyKwS5Z+kjlXAMtJWhXYDjjfzB4wsweB84HtR6FeQRAEQUOGGz7DgF9LMuD/mdmxwCpm9td0/m/AKml/NeCu7Nq7U1q39AqS9sZHHay55prDrPrIcfN6L6ikveCWm8ehJkEQBCPDcIXDy83sHkkrA+dLuiU/aWaWBMeIkITPsQAzZswYsXKDIAiCToalVjKze9LnvcCZ+JzB35O6iPR5b8p+D7BGdvnqKa1behAEQTBODCwcJC0laeliH9gWuBE4GygsjmYBZ6X9s4E9k9XSZsDDSf10HrCtpOXTRPS2KS0IgiAYJ4ajVloFOFNSUc4pZnaupKuB0yXtBdwJvCXlPwfYEZgH/At4J4CZPSDps8DVKd9nzOyBYdRrQnP0ey+spO37na3HoSZBEATdGVg4mNltwIY16fcD29SkG7Bvl7KOB44ftC5PR76620616R/+0S/GuCZBEExFYrGfpwF3H/y72vTVj3hFJe1Tn/pUo7QgCKY2IRyCrlxw4dqVtG22/tM41CQIgrEmhEMwIjzrousqaX979UbjUJMgCEaCEA7BmDL94F/Wpt9xxGvHuCZBEPQiorIGQRAEFUI4BEEQBBVCOARBEAQVYs4hmLC0mp/41LI1aQ+PcI2CYOoQwiGYcmxw4gaVtDmz5oxDTYJg4hLCIQh60DQce11YFKgPjdLG+73OwbHOuTEIRpqYcwiCIAgqxMghCJ4mtAmNEt7vQT9COARB0JU6z3eo934PB8enF6FWCo
IgCCqEcAiCIAgqhHAIgiAIKsScQxAEY07d/ETXuYkWDo5NfVjqTJSh3kx5qhLCIQiCoAfD9WGZrKs3hnAIgiAYB9o4OI7HCo4hHIIgCJ4m1PmvwGA+LDEhHQRBEFQI4RAEQRBUmDDCQdL2kv4gaZ6kg8e7PkEQBFOZCSEcJC0MHA3sAKwPzJS0/vjWKgiCYOoyIYQDsCkwz8xuM7PHgdOAnce5TkEQBFMWmdl41wFJbwa2N7N3p+O3Ay8zs/1K+fYG9k6Hzwf+UCpqJeC+hrcd77zjff/Ryjve9x+tvON9/9HKO973H628433/0co7EmU+x8ym9b3azMZ9A94MfC87fjvwrQHKmT1Z8o73/eO54rkmwv3juSbGc9VtE0WtdA+wRna8ekoLgiAIxoGJIhyuBtaVtJakZwC7A2ePc52CIAimLBPCQ9rMnpC0H3AesDBwvJnNHaCoYydR3vG+/2jlHe/7j1be8b7/aOUd7/uPVt7xvv9o5R2t+1eYEBPSQRAEwcRioqiVgiAIgglECIcgCIKgQgiHIAiCoEIIhww5i2fHMyRtkbalxrNuZSQtM0b3WV6SatI/Pxb3Hy26PdcA5Ww2EvUZayQtKmljSSu3vO5No1WnQZC0pKRFs+PnSzpQ0huHWe5Xs/2yM+5xwyl7sjBphYOk50haNjt+taRvSPpQMofN8+5XLaGWLwIHZMdnAJ8ADgcOLZX5Hknrpn1J+r6kRyTdIOklDerfrdFt+lzXStq94XPV3b/SOEj6pKT10v5iki4C/gT8XdL/lIrYftB7jxWSVkmfbZ6rLd9uWJcThnmfbuVune2vVTr3xmz/O5JemPaXBa4HTsLfo5ktbvn10j2OzPb3L507oXTcqiGXNL1Bfc4Fpqf86wCXA88F9pX0hZoy18v2FyudywX9q7P9d5WK2bh03c8lnd1tG+T+bdsXSa/stZXzN2I4HnTjuQFXAs9O+xvhbuIfBk4k87ZO53/fsMxrgUXz4/Qp4JJS3huLvMAewDXAisD/AL8r5f0ksF7aXwy4CHgAuBf4n0GeC3gOcCZwPrBOg2f7DvDCtL8scBMwB3c2nJnS5zJkwbZ3qufCwAuAq0rlXQ8sD6xQt5XyPgo8krZHs+N/AU+U8p6e7X+xdO7XDZ5zOWAv4ALgL22fK+X5PnB8l+24Ad+tRvmy/Hv22urKLd+jdG5utn8A8LO0/6ziPW9Yr7t63KPr/dPxxcC6aX+d9B/4ZvqtvlBzr3nAwcAiPeozJ9v/LHB02n9Gfm6A7+vauv0u170qbVsBt2bHrwJeNeD9G7cvKc/Pa7azgTuAJ9u8e8U2IfwcBmQJM/tL2n8b7hvxVUkLAdcNWKbM7L/Z8ccBzMwkPbOU94ks707ASWZ2P/B/kr5Uyrsb/uICzEqf04Dn4Y3+/7V9LjO7E3iDpB2ASyVdDTyVnX99qQ6vMLP3pv13An80s10kPQv4FXAq8LilNw3YDjjNzJ4EbpZUflfWw1/YOtWM4b23oi5L5yfTd7kvsA8u4HLWzfZfA3wsO66NByNpCTxQ4x54r25pYBe8MaLlcwHULfq7BnAgLlRynlvuHeZkv8OSkjam/vvCzH5fSnpplyJfD6yG9/oplVcuOz9+PNt/DfDjdN+/tdSulW3fe92/zPJmdmvanwWcamYfSCPia4BDSvk3Bj4DXCNpPzOrrqvZWZ+tgS8DmNnjkp6qyd/0+1pI0tK4dqXYL853vANm9tsFBUj/yI+Hcf827Qtm9rqOgqQtcW3H34AP9KhPVyazcMi/yK1JL5aZPVXzsr9Y0iNdyjAzK/T3i0l6ppn9I5X1K1ig31+idO1TklYFHgS2wVVPBeW8bRqnxs8l6fnAQcDv8JDndX+GBXXI9rs1Do9JehHwd3xYfVB2zZKl8m4ys41pgaTl8F7rnsApwEvTC5/Ty/Gmck7SKcArgF/jvdAL8Qi/v8mytXkuzOwnWfnPxTsJrwSOAMr65vnAV+nPailfN2HasV
q9mS34Qyf141txQXkFne+addkvHz8kaSd8pLglProivX8d76ukOTVlkeq+SiltIUnLM9SILk+XRpSWDbmZPQocKGkT4AJJd+PvePG/fTFwg6SvpOdaB38Pinetjqbf14qkEWc6vimd7ycA+zmONb1/m/ZlAZK2wVXhBnzezM7vU5+uTGbhcKGk04G/4uqNCwHSF/p4Ke+chg3ZccCpkvYpeu+SVgeOodoofBKYjf8Bzrbk0S3pVcBtpbxtGqeLmjyXpCPw3vKHCiHWhyaNwwH4PMs04OtmdnvKsyOuchsISSvhqrHdcNXMxmb2cJfsRQ97IWCJrLct6v8U6+N/oJuBm83sSUnlP13r50q64UPx3uuXgfea2RM1Wfv1FAvmmdnW/bN11GER4B34+3IF8GYzK0ciLkYuonMUIyCfg9gHOApXIx1gZn9L6dsAvyyVuVOLai5L5wgyHwGVf4e2DXkxp/IN4HvUd4DeA+yPzztsa2b/SunrA1+pKXJ1SUel+hb7pOPVFlTcbPVudaqp4wrZ4cIlAYmZPdD2/rRrX5D0WuB/gYeBQ83skqb17/pcQx3ayUXqTe0GrIrrqe9J6RsDK5vZeVnea5v2cuWT14fgglPAf4EjzOybNXkXAZY2sweztKXw7/UfWdpmwAl443SkmX02pe8IvN3MZmZ5Gz2XpM8BnzOz/zR8rucx1DgcaWYnpPTt8D/Vh5uUk5X3jqKMLG154CErvVSS/on3sL+Pzzd0YGZfy/L+hh69LzN7dTktNeQz8e/tPjyc+4vM7O/Nn6ijvB8Dm+A9/dOBJ0t1eCDL+1Mz62sZ0+YdTPn3xRu9C/C5lzu65HtVr3IKwSVp+fw97XPv9czslrS/mJk9lp3bzMyuyI6fk1ScTcpdAn+mVXF16fUpfQtgbTP7QSn/aXgQzveZ2ZwuZe4CXGZm9zasw6xe583sxD7Xrw0cZGbvy9Jup/uowszsuVnexvdv2r6k9KeAu/G5wMr/p0bN3J9BJiomwob3PLasSX85/qLlaR8foPzlcR1pt/MfzfZ3LZ37/DCeq++ka5Z3RVyfeHTa9gNW7JJ3vwblHZnt7186d0LpuM0k+6eAw7ptI/xeFI36n/FGA+B+4By8Z/VqYMk+ZdwB3J6229K24LiU9+24gC+X8XZgj+x429L5RfFRycpd6vAUri+eA9yQbXOAGxp+Fz/K9u/FVSPfxeecntfjujaTzI0n2svvUIP8BzXIcwY+ErkVn7/bG+8YdMu/ODCtJn0asHh2/KL0zlyX3t9VgB8BfwE+MpLvbJd6tmpfKE2Cl7dB6jCZRw6/AA6xUo9C0gb4l/e6LO0wuvdGzYZ68nv0uqeZnZKV+Xsze0l5v8vx6/A/9J3p+JPAm4A78Ub49ixvox6mpBfgKqfzcNWI8MbmNcDWlnp+3erUpcw2zzQX/xOafBGmmbglxfOAE81s037P0KUOPXvhZvbThuUIn4S/OM0ZbQZskbZN8Ib+UuBSMzt9kLqm+1wJbGPVntxSwMVmtkk6/g7wTTObKzclvRwfkayAN4Knlq5/Tq/7WoPeuqQ/m9ma2fHzGPoOtsAbxCvw7+BLWb4F72D5fex33Kc+fd/BQfPLzXg3x59rc2BN4Goz27GU71jg3PJ7JOkNuAB/Xzq+AldlXY6bbR+EG238r5n9u3TtdngP/4xS+puARyzT+0t6OfBcMzspHZ+BvwPgmoBCjdz4v1i65+J4xxlcldlIs1DHZJ5zWKUsGADMbI6q9tH/KOfDdf3vxnvfhSXRK7rca0d8eHtKltbU6gB8MmkzgKT3fxvemG6Mm5hul+VdtlcDmb3Un8UFS0fDll7Iw3Hh05Y2lieNJ9klnW5mb0n7XzSzj2Xnfm1m22bZz8B7a4VlVl4PA8p/6m/SexLwYjN7BNdvFzrupfDe8wH4aKv8Hb7NzH6Y9rc0s0uzc/uZ2bey7IuWBQOAmf1TmU0/zazF8uuLjsRawAtT8k1mVtE3N8XM/g
j8ETghqUd2xNU82wK5BUzTSVOA1TK9ed09P5gdtrXYaoyZ3S73G1gibYtTP0e1iZntXU40szOTqrZgcTP7Xtqfm373D3W5/Sdx67gyv8VNSvNJ4U/TaT30fHxeaSnc8OHClN6mfSlUUJ/HfTLuTHnWkPR9XKD9t3xNPyazcOg6iUXppTCz3NtxafwP8S58reqvZvnel18ndzI7GJ8YKv/4bf5AZkMTZW/EbeWvwU303l/Kuyw+IdjNqqVoHDcwszdXMpj9RPXey00sttpYnrSZZG9jnvpGfD2PFwNn4eaO82rqXTC7xzkAJD2bod5yYSJ6DT7hfHnNJR8Cfpj2vwnkvbR3AblwWELSUmb2z9I9l8Zt7QtamZKm0c73gBkMCcqNJF0D7JUEHurucClcdVWUVzz/5rhZ7m34qOFtdE4iQ/NJU4B/499lE1pZbAHrSbqhJu8CayVJH8efaRq+bPAV+O+zd+qslKlYp2XkTsGLJy1EUdf/5MdmltdrMTObX3kgs/tUjaywjJndlB3fmtoC1Om016Z9ATeaWBpYy9zKq3iHvpK2/Wuu6clkFg6zJb3HzL6bJ0p6NzUvq9yi4EO4SeCJwEusZoJO7k+wJ/ARXF2zR+nHLNgwNbbCG4ii4RXeaykVq2fiTl/b0OlVW857p5mVPTLr+GfLc00sttpYnuxPcwugxuapZvYz4GfpT7Uz8FVJK+K9n4pVkPWZQEzcnZ7l68DBZla2ZivTptd2HHCGpPdmvf3p+BxQbuHW2JQ0cRQ+R7C7mT2V8go3U/wW/o5CbzPaXLV4CUPfwZlZZ6WOj2T7ZeFbPr6/4W8A7S22bgde1yfPnvj7/nPgMuBK624JB3CvpE3N7Ko8UdJLcaOJgvvo/J/mx4abNhcsI2kRK1mzpZFj+bft6NRapzFDbibcpn0B71A+LxvNY2aPSHof/h5MKeFwAHCmpLcyJAxm4L21N+QZJX0Z75Eei/e469RMSNoHd3T6HfB6M/tTt5ubWbkn3Ysj8d7fI7i55ex0v41xk9WOajQsc2VJdcNc0cVZrB9mNr1F3itxR7hy+jn4RF5OW/NUgP/gZnmP4N7gdX8IYIEFyP74EB3crPWoQq+LN8ab4+/FhyTdgY8YLsfX2X2ss8TmvTYz+4qkfwAXa8hR8h+4hdsxWdbClHRV+puSghtbvKN0LwM+I+nWLHnnYhTRh3z0tE8SSr8nfQ8lddW/gJ831Ff3E7TD4fF+cyt1sYEBAAAgAElEQVRmtl7q+G2BeygfnH6H63GDhO+XLvkIcLo8tEfebuyJj1iLcl/eop4/Bb6bVE//BEh1+AYlNShwi6TXmlnHb546DgvMlFu2L+mS6gSy1Zt2N2LSTkgXSHo1blkAHiLgwpo8TwGPAU/Q+efucIJL+f6OW4nU5csnhV4KrGQlHwO5x/K9xVAxS18NWBm4PusJrorrrP+c5XuRmd1Y8wwL4WEuTk7Hh3X/VsDMPl26/uNm1jNYXpuJtaKu+J+t0InPBb5SngtSC/NUuV377sCmuOf4aYUw7VLnWXhH4UN4YydcDfRl3PrqBzXXTMd7pPsDq5vZ4qXz/8JDNwhYO+2Tjp9rZrVBGJMqiWxYv4oNbk57q5mt2+XcPDNbJ+3/CR9Vnday/CVxFdkBuCpi4ezcmbhAPQ+fCzmvi4qm+C4fLHrr6f+4C673/lY+SpO0rZn9uks5HXM7Ke1bZtY0LloxCtsE79XvU36uLN8qwPvJ2o1U13uzPGXTT8NHDzfUqBAXAT6Hz2EWwmxNfOT4iVzfL4+X9At8lFOMzDfBhdtOaV5okPblZ8BPsw5Rkf424C02gCnrpBUO6nQ8qWCdjidNy1y7T5l/yvJeCLyz3LORW5l8Px8+t5ngTHrCfXH97Nn4ZNZ+uBPZ9Wa2c9vnSuX2tdiSdCmwS1l/Kndi+7mZbZ6l7YzrMr/AkKphBu4jcpCZnTVgPZ/CTTYvSfUt99Q/WMp/Ba56uaOUPh0XLIUhwH
oM9Zy3xIf3haXOV0rXDmwpJHfoehMeyuMFZvbs7NwO+BxWLky/mEZb5XJOxIMDfjbvEUr6BK4+eHtW1yOBZ+L+ALXzM3ILqcKaZwvcGOJWfORwaU2HYBl8pLU7HuOrmP/5bSnflcAbzOwvkjbCBfoX8Dmj/5rZu7O8CwNvwd/tc83sxtRj/jgeNqYc0O6EYvQkaVad+io14sVv+sL0nV6anuuy8rvcFEmVTgVuVbQ+/r//Tc01S9BpKfTvcp6UbzFcvZ2/B6fkI7U27UtKXw0fpeRzQDPwkfkbLPlLtWEyC4fc8aSul587nmxtQyZia1mn6egbraF5ZOn+V5tZbfwbSTeYu/YXx21MRM/CPX4vx1UOK6dn2t/MrsvytbEAQlKdk9sCiy0ze6ak2WY2o+EzXY+rNO4o5ZsOnGVmG2Zpjc1TJb2D3qOMjgZC0k1mtn6XOt9kZutLug+3T78cbzgu69aIpusaO4GltK6xnbJR4nvw3uxH6RSmR+ABFY8tlbkM3vN8CdmEND6f824ze6iUfwfc0bI2xpak+Qyp0i7FzTxrG6+a72NF4M14b3sFM1sjO7fgvZB7Pz9lZh9NI93rSu/MCfhk+FXAy/DfZAY+D/Szmvt2/d9keX6anucy4BrrM58kj8jbq5O0TZ/r18KFZB5B9aOWTIEl7WpmP87Ofd7MPp4dN3q32rQvpXNb02nddkGv5+mJjbIzx2htwHNa5G0aCfFB3JGrvD0IPFC6bl6P+80rHfeK8Fg+zqNMLow7Ly1ec49r656hrsyaa5fGLXVux8OUr5zS/0hNBEzc6uXWUtrcHuXfVDp+Ch9CF5FNv59txw/zPbim3zlg2ZZltnECOwW4C2/IX5N+s9vrvhNK0WpT+or4PFS3uqyNq8BeR8m5M8vzfNyT+gzccqyV81Pdb56dWx4PUXFhes6v93hffw9slx3fUMo7F1go7S8OPEQXp81+v0OWvl62v1jp3GY1+Tep2fbF1UFXt30/BnhfmrZFjduXlFYbHZmaKMlNt8k8IX0mnSaGvWhqfbJSi/v/n6TD8TgmBiBJuB1zed6jjVnaAv2k+WTS3VY/MdjYAqhA/S222kysPSFpTcvmS1L+5+BzOzmNzVMl/bzXs1lVd/oCdTd3LEaP38XVGY1GWTR/X6BZbCfwUXpF1Wlm96velDV/twuVwLJFuiWfAA3F2DrQzM6tuW9R3iWWJlkl/cCSWipxFdl/Kf3mb2DIF+ds3K/mN8W7ntEmxtljlkZSZvYfSbdZNfBiTjeTWlIZH8SFc1H3y+lsE75dOsYyXb089MgncEH1XmsQo0y+ZkT5udq8L03ztmlfwFVJXTUpZFGSmzKZhUNTqx5o3jj/FH/ZzurSIOd8GLdDnyepGPZviKsM3l3KW9hrC1g7a8zyBqwgN2GDITO2jslzWloAqZnF1qH4xNqdkhY40pAm1kp5D8Nf4M/TqeM8mE4/BqydeWpdsLRevKBBnrZhwNtYK22kodhO/5dUWEurOhn9iKQNLcUTKpC0ITXxpuhtopr7BLwUD2TY733NJ9FfWDpX/i/dgS+i8218MrqXA9UBDMUCe3mW91mkkPcZ65Xe/bWz/8VTlqkiE71Mauvq3tdZDEBueHEobqRyuJldVJPnTKq//Qr4RPOepfQ2nb+medu0LwBbWcMYV02ZzHMO9+JObLVYNnEp6SE8tr9wL+gizr/wF3r5lO9NeA/3VfjE2qnAr6w+GmdR9nPJJpasxoNVIxAKoabM39DQAijlb2SxlfI2nVjbEH+J84m1r5YbwCz/wngogt2BDYCPWRYgsR+qt2jpq8NtM+eT0op3S3jDV7xnwi0/ymGr82s3wQXFW4C7zWyLlP5y4GRclZYL01nA22zAKJrddPG98jWY91qi7jeXtAY++f/lBvd7Rcq7b5ZW9z8oOiCHWCnURRMG+G2vxjsEX6bGATIbkZXnHgyP0XWLlUyf03/rHwx1zAofEuEq4dwZsdW71aR96f
asw2UyjxzaeGbmFj7lnumCY/M4/j9Jw+qd8SBex8rjOJ2S9zC6DPuXKw/70/4gjf+rGXopbrSSdYSZbdWmPDPruyRsl4njdQu1h1Un7v+C2+7Ps9IEaancsnnqN6yLear6WLRQWqKRZmqFtn4WbZzAinqvZGb32ZDn+0fIwrGY2SWSNsX12+9IyTfhevG/1ZS3DB4i5tZ0vGtW1/OsvYnscvL4QQul/eK3Fu78uIBcMEiaBuyKC7xnU12cKa/zxvik/K74fNZP8vP5/6Bf3pSniYqxjTc3uMPcP/AJ9nKEgXxEdpCZ7dDt3iWut+YRdxu9W23al+KShvdvzGQeOYy4pOxynw3wVbdebJ224JWhaIZZpylrYVm1ICk7NjNbO8tbmKT9hyHhtwklk7QuDXlegXIMor4WW/I4LL2e6V3ZNe/GY7n8CVgLD1dQuyKaWpinqr1FS98gcW1HWb0oz7PIgyoej4/InsR7f5d1uXYjfEQ218xu7nOfY3GrqhPS8Tw8BtMS+Cph703pxai427MV1kq9flvM7J3ZvZfGVZB74IEUfwrsZjVrHMiD+c1M23145NKDzKwySmiTN+V/VZ86/1bDDMHdjTbty0i1RZK+YmYHpf3G7UvK31iT0rg+k1g4XGGZOVmfvHUTlguwklmY3K5/V7y3+xzcCuRUKzmetKjriqWkhfDe8UG4hcKbsrxn4nMeJ5TK2BN4kyU/h9Tgdg1QZ6UQHG2H3w2e6Ubg1WY2Pw19T7bMD6KUt/EfOJX7YvOV7xbHHRLXti4TlyP9XNm1m+M9z4vN7F5JL8bnU15hJVNOXCDcIullwJfMrNKoySPxvg0X+C/D10z+bjlflv9a3GCgmIzMhWA+uXwr9TpowBvQAZ7937hwPhRfO93kk8eVSc30Hv4Oj/c0L6UNO286VzF4GC5qaHYq6TZ8PqWWvCMkX6Huaz3ydj1XqltHFN02yOcIP9mjDq2F5GRWK+2r7kHHysOup/Ce4yl4DJZuOvR34r2aFwE/wydhf1f8QWvyr4yrCXKd+9FWWnikaNjktt9vx4eW1wGvtWrcpvXN7A2lNMzsJEn/myW1DVDXaOJODb2e8dAG81PdbpM79tRSavyfmdJqQ5ikckfComWBWkHSK7tdnO7T0fOWT97vhP9GH5N0Ht4AfwH3Ks55wtKch5ldmXrddewGbGRm/0qdhXNxK6puLFJ673Lrojw+T6OV6FLnohtmnZ7kh+Dv1rfxlRF/1OPa4j28SNK5DOnTh5sX/D9YCP6f5J2ogjQi6tbDNTPbq5S2O0MRaA8hBUBMbM/QJPqyuNqprn6GW3AVLIw7IQ5XtdNxfdP2JdEmxlUjJrNw6GXVkusOyxYlp+C63lPwhXXyyeZX44HJzrcek9AA8gW8T8EdjwqX9U2AqyS91Tq9oBfFG5UDcdXKLj0a8tq5gSRYFqi1rGWAOhpYSajT67mwlpkB/FRS2eu5bFrYcVwexsoDgB1CspqRxyP6opnlwc2gu0VLUW7Z+aeJDvcjVDFcsK5BNeLsa0kWQPLItHfha1fcUVNOOcZVx3HWa3zMUrA7c/PVfnNAT0l6lqX5CEshVZLaMV8q88E8XzHCxO32P2VD5rO1DlXA63EhukA4mNmRwJFpRLg73kg/W9LH8KB9f8yu/4WZ5e/hAek7OCbl/XVWbvmd7Zo3kTeW3Uwxf1GTtgb+X6uLT9TUuulOM+slUHP+amafaZJR3SM7FHNgRb7G7UtixGNcTVq1Ui9U48VaOr8bHjXzi5ZZXiTVQVcsC9MrD9vwPjPriECa9Mr/z8xelqXdjeukj8RXKCuXm3sIfx3vhRxgQ74GS+FC6z81jW4jCyA1sNhSO6/nNqqiQ/EQB/tZsrZIDc838Cian8vyrotHp7yrVOQawN/6jI4akf54h+J2+Yeb2c9L58vqqa4L2qhhjCt1zg2Uf4OK/4Y8Js7+uDVY8Y69BBfeRxU9fUm/x1feeyCNkE7D1wvYCA/fUQ
nrLkm4r8vH8I7S4dYZgrruOV+Ez0G8xVJcp+L+ZdVdEqi74vMU/TyOu+btpTLsUtZz8Z7/K/H/y3FW8phuqobs9ZvX3LdN3mL+sXaUYWZrpXyN25ce91ob/812N7Oy+XL/uj5NhUNFd5d6XLvjzj0P4gu8nGmdaz3/ju4/nJnZK7O8fcM2ZMcn0Hvom0/0Lor33N9BZxCvE/HlTh9P+doGqGsyuTe320vU63n7IekPwIZWssWXm8xeb2bPy9Iar/CX0msnwQvyRldunvgJ/Lf4vJUCCWb5ypO8r2RIsFq5IW9Ck++/5ppCzbHAag2P9vqrLM91ZrZR2j8amG9mnyqfS8eL4O/VQXhcqS+Y2YJIoA2eYWE8+OMPs7RWa2O3QdKTuHVRnYmo2VDAzPVwYb8xbqL6w24j/z5lLjA7VY1PSlbGpWa2ZXa8gg0Qy62m3NVsyOCkcftSOvdsXIW5B95Z/AIekK92De6e9XmaCoe7rHPS8Ld4yIjTcZO5Dh12kx9W0oy88ZV0M7CFldaESMPGy8ysEs665TPkvgZ/slL8fbUMUNfnXlua2aVp5PA6q/d6/nlZpaP+obKLfLd0+z7K59Q7pswcM9uglDYfH2WcClxJSbAnofdafP3oh/Feck+fgi4NefH9Km/I1WMVtHT/yu8gNw/FBgwKVyrrRnwu4wlJt+BWYxcX58zsRWl/X/y3ugAfMd/Ro8xuwR8/hIfE2DnLOyKTsYMi6ccMrRt+Om4xlt9/2I12zT072pcRLHdBp7Zt+6KhpXpXw7+H0/HR/lqD1mcyzzn0oizxnpPS9sF9FwrauJb/FO/BF3wd+LWkg+gMvfvFdG7oJvXrLgxVNvsDJbXKV/CYOnNwU7+6hVneWZPWFTXzH2js9aweobIlmXVOcN4jaRsrBQFLo5/yehaNV/hLPAv3ep6J95Z+iU/Oz83y/Bxf8Od+4KOSPpoXUDMSWA4P5X10qudVuOOUUfoegPfiPfrTcbPbbl65wq1JPoDPK0nSE/i60hV9tfosf5oJnVOB38o9s/+NWwMhD/OQL3rzTTxO18uBLTUUsqPoheeC/wcMBX98N/6OCDelvo5ORmoydlBein9PB+EqOLK6VP7bGplozqPVo86/w8btS+Jb+O+1hw2tFzOsek7akYO6O8gI2Nq6xNwfxv0qvYXUuH4UH/Ybrr/9co0Ou5FeOuX9HT4BdTE+Wbi5da4WVVe3fhZAhWqrr/+AGno9q2Go7JS2Pt4DvYROobMlPscxN8t7KnCh1a/w9xoz263HMy6GC4kvA5+2FAq9rUpHHrp8dzO7Kx1fh0fIXQoPl7xNlndFks4cn1f6EXCGVaOmfgjYAe/Z357Sngscgwvrcoei15yO5aMzSZvh4St+bUPzVM8DnmlDHr+NvfTzEVrqVPwVWLOsFkznx8TfaKToo/M3S2a16u5HJOA7ZjbQglp96tahDm/avqS8xXs4E+8wnQ68YzgjnMksHBr/4SX1tB0uq1G63K/8w73UzK5uUtc21OiJu/75VLIAwj0/6yyACvVDT/8BuRduoxXA2uhEJR2A92o2xJ2qwFVQJ9fMQ6yCe+E+Ts0Kf1bvTbwYbmE0E5iOC6LjbYAY9qm8DtWWskVn1MO/RtLq+DzQh3DDgB9k567Fhdt9pWum4Y16Y729Mmep0aD8zvV5B0dtzmE8UQunwZbldhsRCpiVzaMM3L6k93A3/P+wFD63Wo5z1ZdJq1YqGv/U0OVxgOoatl9S7S0YripYmWTypvpgW6Tryo5sx6Ye+2l4aI2uHq/yWP6/MbNbk3rhOIbMDWdZp0XC4hoK7wAe7mHBHzPrCRYWQFtZyQJIPkH2OTpp4j+wB3C03K6/MPWtXQGMLr4iXc6tjg+FX4DPk1yKr7i3JO4JvgDzsBBbqHOFv19azQp/AJJOSvnOwUcLlVX0BmD5Up3y1chqe4zpN5qJq7h+RTW0y6JlwZDKni83QmhD4UA5WmykzjWLuwV/BB9RTR
rUcLXDQRv/BnQ1Gimda9y+lDGzu/E5mK8mNfXMQSo6mUcOi+DhG96FN7JFAK/v47b+XSNJJtXHx4D/wSdQv5nSe77oNTrz5+M9xd3wUNun4iqVO0r5bsTt5v8raQ9cbbMtSc9vZq/I8l5EvSAr6rB1ytfYAiilF0tfksoulr/s0Dmr+QpgeXkdp+iylKakZ+CjgC3wVck2Bx7qNgJpgnxivli2sWdAwRZlnowL87Jqax9cGM/M0j6Dj1puxv/I51qNpUyf3ncr1UydinMkebqOBmCByrDvaoeqzhMaHvLjEstCzwyzLj3VwU3bl5T3bXh7/oNS+tuBJ83slNb1m8TC4eu4BdKBNrRm7zL4ZO6/zWz/mmvWxa1WXoZL1hN7CZGW9dkQ/yHfgtvj56ZuubnhKbht/zfScXkIvylwl5n9NR3PwkcZd5A5NamFBVBKax0ZVr1XABukvGKpyi3T53L4YjGj1UsbCLln6s/wKLb5ZOBieMPy9yzvU3jguMJoYIFVE51CtzChrNyOUuTOlL+Xs9T1VhPnqCnqYy012eYRAOSmzsU7f3O3EaQarnao+nnCFYDt8P9hqzW7S/dprA7OrunavqTzVwLblAWN3EfqYjPbpG09J61aCQ9v8DzLpJuZPZK++Ftwsz0A5A48/4tP7HwJj+3STV3SGrm368q489ZSuFVIzlPyxU8exIfhh2fnyhY438FHNMidmr7AkFPTsQxFkuxmAbQNVQugrpFh5aGkZ+Kmi3n68ni4g93wP0XHMLxbeenaS3EBUBwfi3/3j+LmppcBX7OSmd5oooZmtwDm4Qm2UOeSi91UW41MBa1mofs+5Iu3lGndoUnqzMNwk9R+1lJlr+8ObJTNU9uQOhxn4VqDYm2IDST9GTd2eKR0yTKSFimP7pJqb8F/0TIjkVK+FUh+RQPWt606uEn7Aq62rIxAzOyfA6gtgcktHCwXDFli3Upc1+O28L/EncY2Vbb6lg0QsRBAHrN+Jr5e8Bz8hTnQzB4uZf0krk9cGDjbknWOfFK9HJ99YRsyp9sNONaGQonnZoQfBM6SVGsB1Kfe5XDJP03pbVYA60XZAGBNvNd9Kx5++G58icgxQe3MbheQhEHtXEeWp3U49pr6VZw2bRj26V04EH83XmolaylJB1qntdR4m6e24bP4f2trG1qveyF8be7D8Y5VTrfVDo+iutphBXNP9OF8L2+npA42j032FrydyqMFNG1fwOeFliqeKStjadyYoz02wNqiE2HDh/171qS/DW+A87R34Iuq1G5Zvje1uP9duGnmfqQ1mPvkXwRYvpS2FG5umKfdSFrTFx8BvTI/l+2vg9usv4s0+QTsldIqaw3jVkKHpTIvwf80d5by3Af8EB+VLTqM3+bPNWnCJ473xuPFzAZ+jU8ij/a7cgUwvSZ9OnDFMMt+FHikZnsUn+Bs9C7VvcfZ/palc/sNUM9rgZVq0qdRXce8dr3mibjh5p11654vQs3a3Cn9iPSuX5O2+Smt7zuPx1+7cBj1vaXJuQHal4NwQ4jnZGnT8Q7xRwap62QeOeyLB4R7F5095yXw3u8CrBT+ugfvkUdm3df69whf3iAPQIfNdOp0FJNb11maL8lo6tR0JB5m4vjSvTZI5zrCTOBC4XfATjYULvnAUp41rMuqb72eqXyKGmc187f1RnloiofTthM+kuvpBzICLGM1k3hmdkeapxoYM+sWhbVVMTVpH8IFNbgDWz4H8C7c6akNbaylJsOIoeBxqzEAMPcYf6wm/8Z4TK9P4x2srfD/ypL4HGYxpzeH6u+yAu4f1DOuWB+aqoMbty8AZvYVeTDLi9NISHgH5QgzO2aQik5a4WBuw/6ykl74nPKXDqCGi9ab2faS3gycJ+lE3EHpqSzfI9l+G3VCuaEGf9FeLGkvy3TZZna4pAsYcmoq6r0QnUPkVawmXoqZzZFbY5XpGy65qWBI1D1TQUekTEkfxPWsW+D68svSdjw+XB5t2pjdjg
o9dPjCVTh16XX7dcdN6BW1s3xuMpmnlk2/C4SrMsv8PzxQ4b/TvNoh1M/p7VS6zvCw2HVGBW1opA5u2b4U13wH+E5SJVHT8WzFZLZWWhwPXbAO3sAcV9eDSHnbeshugA/pHsWFQ2F5MtBCHN1IFj+nW4MIizXX3mpm63Y5N8+yyJmlc0W45Jl4WPOTqA+XPGJI+hru23CZJSussUQDmN2OQh0ae8mn/CO9OFMra6nJgnqvmIZV11K/3lJ0YfUIVCjppbga7lel63cA7rXBF/5aB/dgfh5DndqbgD/gob//NGC5I256O5mFw4/wXujv8LAEd5hZ15WbupTRsWC63A7/ELzhrF2WsmG5lYmhHnkHMhvUMMJMZHk7wiWrdyTK95WHp/LQCssX6or0/b0DnzR7QdtnGi00gNntWNJlIrEQaLlPCoyhQHs6ouaBCi8E3ll+N9K79H0rLdPZ4v6tog63KHfkTW8HmaiYCBtuH59PMjWaRMMn4N6PC5U/4aucFeduxk1dl2xY1mqk0A7peGXcMe8vDa9/PnD5gM+/Cq6a+Q1DE9K/xcNUPKsm/+K4xc638ACEdZN4twGb1KR/uvz94iqqh3Ed7G9xp7678dAXLxnv96Phd7gQ8NYxvF/j9wUPFtl1G+F6VQwIJtOWvsdP4+bWZ6T92klc3KT9Utz89VqGOsjrAJdm+a7ucb8bhlHXXuXOKR3vgMdYuy9tvwV2bHm/FZq2jeVt0s45kNl6m/cCumZU/YLpa1nVkWhXaxh+QR4v6H/xHt1ikr6NR0w8CXeYyvPWzXmsgM8rvK3J/cpYyzAT+HoQ+UhrfTJfkMSuwI/lK01dnkz2jsGF2FalvIfigmSePHTE5cCbrSYo2Hij7iGoP4ybD548BnVo/L5AT7+UhfCR7UiOdibTBHQHarlimjWf0+sIoVJiyWFUuVHUYXnInX3wwHtFWI0ZwBGSVjezY5vczIZhejuZ1Uq5DjVfuKMSNkENF0xX99hK4IXmVkc34RYFD8gD+/0RNzms6CJr5jwMDx99q5VWqhot1BlpcxHgKqtRZ8lXwzsTb0zfk5L3MLPHSvnKevAFQ/KJhqSzGApBvQ3e0xSwv1VDUI9WHRq/Lyl/T4Fm2ZoKI1C3gRe2H280AiumdSn3O/h/9NBCgKRG9tP4yHzvXtf3KLeROjh/X0r5VsTbsEZq29R5/IQNoAabtCMHa+dx2nTB9MI8sOgxv7dHmf8pfjgz+7OkP3T7o1v9Kl8rMYCn6zDoO9KSe3/ejZvq/Qz3BN0PWCrpxfMXtexFu5zq106eCDw3E4zfo0cI6lGk8fuS6Lamwi6DCLQBrKUmC8uUBQOAmV1XWO0MyIeB7wHzNOR8uiHei3/3MMo9ADhT0lupiTqc5VNZMACYrz9eKbSP6W3TtbA7y5ysI4dB0NCC6TOBdXH7+vKC6UXensHHJN1Lpwv97vmxZV7X8nj7R+A21J/F//gr4UPZPc3s3GE8ViOajLQ0FOu+yANDIRwsH2m1tb4ZT0bC2mcE6tD4fUn5G6+p0PD+k+b3aoNGf0XG55KtbWIp5MVwKamD55bVwfJYSXtbdR2VDYHvmtmmpfSy0cWwTW+nlHDIkcdbmolb6lTMPvs1IOq9GAtmdmKWdzbe81sWt6XewcyukK99e2ovIRQMnzYqyFGsQ+P3JeUfM4HWxrpuoiFfHvM9uIdwecW0483s/41X3YaDPObZyXiU6XyEMQv3nq9d6lYNAxA2qsNUEA6Sfm1m2zbIlzcSv8NDUSwYw1k1iFe3cjoCe5Xsp2/O9YX9RihjiRrGus/Sd8BVdkXI7bl4dMlzxqK+kxU1W7lvxAWapNXwidgbzOxxefTZA/AVw57dtryJgjpXTAN/D2tXTJtMSHoWblmZ+0McbfULXtUGIAS6BSDsTxOTpsm+UYod0yPfXenLvCvbiuM/l/Jeku3/oHSubPb5+ybnxnvDTfym1aSvRMnkFu+tzcYd6ZZJ29b4xP/e4/
0spbqu0Gsbw3q8L71P96ftTuD9Y3TvA/AYQpfjPex3pzp8HVh1vH+j2Ib9+x6FL1ewUJa2EG6a/81Bypy0E9ItWVbdYwFhZkU0xs3NV1FqQu6E9MLSufKM0YYaWkmrWFWryLd4w/uNBYtZTYx/M7tP7lmdcyBVa4oL02jiElx9NlHoFf7aKCne75YAAAzaSURBVC1CPxpogFDNI8zewPOtobXUZCK9cwfTOXIY0RGspPXN7Ka0v5mZXTFSZXe530V0t5w0y9YxT/wPaRngLNNTkj7OgCFqpoxwwGOldGscCuFwNp0BznrRSx/Xcc7ax/IfLxrFui+SrYU1xXhiIx/+ehAah2oeJdpaS00KRsofoAFfkkcUOAsfdT2vT/7hUrcM7Gb4c9at59A2AGFfpopwuNPM3tUgX5tWbTlJb8CHbstlIxPhwmgy0i3W/Teoxrp/RDXhNpI1xbACfo00kt5mZj9M+1ta5hiVnrVthNNBMKuxNDIPAPdU3QUjzOqSjsqOV82PbcA1TSYAozKClQevfMCSrt7MdpL0AVx1s8ewatyAXHAnP6lP4FqG91op3lOibQDCvkwV4dC00V9NHiSuFjPLbcUvBl6f9n9LZ5TSi9tVb8JwKN6DvVNS4YG7JnAc/nLmfBg4W1KtNcUY1LUNIx3+ehC6hWrempqV+0aBj5SOJ/2oITFaI9if4HNofhOPLLwbHr316HR+VEkGIofiy9Uebma9ggz+DejWdlUmsJswVYTD2+sSlUIRmFkRPuHfuL6yL2b2jpGp2sQhDUsPllTEugeYl3q3i5byXiLpZbg1xTtS8k3AZlZjTTHOjHT460H4AC5MW6/cNxJYp2l1X2upScRojWCfYWnFNUmfx9eBeI2Z/StZBo0qkq7G48B9GTciQB6mBgAz+32e38y2Guk6TBXh8GdJh9A/ts79ZnZckwIlHWkpCqyk/c3sG9m5Eyaz8DBf12GOvOu1taQ98DmbVYo8knbBnYw+OU7VbIN12a87Hi22Bd6Ke9kW+uqLgX3q1E2jgUoL28sXh+m5sP0kYLRGsPNSmavjguH5STCMVbThfwL/wNeXeHPpnJGNagAkfdTMvpT2dzWzH2fnPm9mH29bgani59Aoto6kq83spV3KeLaZ/SU7HtF4+xMJuUf3HvjatSvgMX7OtswLVdIZwOa47f1lDK3XMLDTzWihCRD+WtJXcGulF+B26JeSFj2qU4uMwv0La6n9ytZSwJVjYC01arTxB2hR5mJ4IMrH8WjF38dNgdfDlxY+v8flY85otEdTRTgMOxSBSsHJcue1siPbZBUOafi8K26LfyoegG92L2sfSWvhQmKL9LkmHpZ4x9GvcTM0gdZzkK95MYOh72tz4CEzW7/nhcO/7x8oWUul9CXwQH6jbX0zqZEvLrYBHizzoTG4X1fTe+gwvy/y92qPBnK0nSpqpTzo3JOS7h5gKF/WTS+UTNsWyvaLPJPFdLXMu3H792OAn5vZY5J69h7M7PbUy1oibYtTs4b0OPMGvJf++zpzvzFmCdxhcNm0/YWxWSp1vK2lRgVJOwOrm9nR6fhKXFcP8LFcvTIc0nd39UiU1ZBei/7k5vd5Wt1+3XEjpopwKJzQoNMRrU0ogvIXvCyu4ywEwu975J0srAq8Bo85dWRyxFmii+/Dx/Fe7zR8icMrcKufvc3sybGtdl9WB44E1pNHrxxTlQ6ApGNxtcejwJXp/l+zUsC4UWS8raVGi4/iQQwLFgNeis+rfB8YEeEwDvy8PDrow4g72k4J4dDUCU3SN6lv2EVpkQ4zmz78mk0sUqN+LnBuGg3shPd075F0gZnl9t174pNmP8cbuisL646JhpkdBBWVzjuBYyWNukonsSbecN0K3IOHRh919URGo4XtJyHPMLO7suNLzOx+4H5VvfonE4dSHR10ZTQcbafKnMPWlkLiSlrLsgW3Jb2xkNBqF2m1PKdgwH2lF/VpgTwg4c5m9oNS+gp4Q7sF7r35TN
z66zIz+/6YV7QPyQRxc7xB3BwX+HPM7J1jdH/ho4fiO3sRHsb9cjPrGVJ7hO6/OG5okE/cnjxW1lKjgaR5VhNVOZ37k5mtPWC5SwL/NbP/puPnAzviDrVtevQDMRHmLaeKcBjWTH76U72uZB5W55CyAr5ox0wboxXGxoryhHzp3CJ4mORX4qEM1hqNnsyg1Kh0rgCuGEOVTrk+q+MCagt8dLaimfVaPnIk7nkArk67dgLMu4wYkk4GfmPVldX2weNYzRyw3IuBvczsVknr4AElT8YjEF9lZocMs+r97l9Y2FVO4arwF4/m/WGKqJUYwAkqWTVth+vft8VDeC8QDmb26i7XzcAjJL5yGPWdiHR8T5JejzduW+IN71y88fkwrmaaSIy3SqfwsC1GDP8lzXkAxzM2E9Kr42ar4zbvMkocCPws+eLk6zkshptiD8ryZnZr2p+Fr7vygaSavAb3FxlNbqf3pPSoM1WEQ+OZfHkckz3wIeRVeOO3lpn9q9GNzGYXHqhPM8rf2zvwBuajwDU2RmthD4KZbV9S6XwYeJGkMVPpANPxzsWBZjbmE8ATZN5lxDGze4Et0sR6oS77pZVWVhuk6Gx/a9xTGfN1MMbCuuvxsTSxrmOqCIfnSjqb5PSU9knHC2z4Jd2N2/gfAxxkZo9Kur2pYEhlrMIktVZS/Tq04N/TKnmCmfW0w55omOtPb5T0EPBw2nYCNsWXix3t+3dbw3msGS9T2lElCYPhCoScG5Lj4j14KJlfA0gaVfVfxqXdTkhaxcz+PtoVmCpzDq/qdd7MfpvyHYkPRW8ETsHD886xbO3krMw6y6ZignZ/m4SrUE0kZ7GRpIdK5zL89520dv5NmWjzLhOd5By4P27efbyl2E2StgDWLhtnjEF9lgPehGs1XmBjsHLfVBEOjWMdJfXDVvhcw45472ov4BzLApXVWDYZvrLW1Wmo+7RA0kp4zKlJ+6LII+0W4T0ms03/wEg6F1/R70ZcKF4O3DiZf9fRpE2bMYp1WAI3M94Dj++0NN55vXgsOjRTRTgMFlvEI5EWk9LbmdlKNXmWZCiC6R/MbKCFNSYC8phKR+DmlZ8FfoA3KAsBe5rZuVneZazLurSS1jSzP49BlYMWjLcp7WRivE1JJZ0CvAJXZ52Gq8zm9QplM9JMlTmHJVW/EAYwFP623OAlG+dfAL9QKRpjEhxfxsOB35HKXkXSN83sCEkbTUJz1m8BH8dHSxcCO5jZFZLWw2MtnZvl/Q1pXYTkIJcvW/gzmq+oF4wR4z3vMslo1GaMIuvjwUJvBm5OYX/GtCc/VYTDasBXqf+h8/C3v6F7g3cynQ3eV4Elgelm9mi6ZhngK5KOAbYnm+yeJCxiZsXE22csrZNrZreounBKnrBCj3PBBGACmNJONpq2GaOCmW2UOmUzgf+TdB+w9FhNRsPUEQ7zzKzJj9mmwdsRWDfX2ZrZI/KY+fcBOwxU0/El12P+u3SulwnweK2REDRnOuNoSjsJadpmjBpmdgs+ojtM0ia4oLhaHjh0i9G+/1QRDk1p0+A9VTeZl4Z/84te9ySjTfCulSV9KJ0r9ou80wgmFBPIlDYYAPM1pa+R9FE87tKoM1WEw0cb5mvT4N0kaU8zOylPlPQ2XE846WgZ8uK7uPVEeR/geyNWqSAYHz7W7YSkLc2sqx/CaGJmT0l6N/CZ0b7XVLFWuojuqg4r5hYk9ZyUM7NPZ2WuhkdN/DedUS6XAN5gZvcMt95BEIwPKXzOW/C5h3PN7EZJO+EGG0vYAIvnjGDd7jKzNUb9PlNEOGxSk7wZPqK417osDdqw7Nxt/yYrxct/uiLpqF7nzeyDY1WXIBhpJJ0ArIGH0HkZ7kk+AzjYzH42jlXrGQRzRO8zFYRDTvKW/gSuQz/czH6VnYsGryElJ8BPUzKFtCy8eRBMNiTNBTZIapzFgb/hntH3j9H9H6V7KJslzGzUpwSmypwDkrbDJ3Iew4VCXcjta7L9SoMXDG
Gda1scEMIgeJrxWOGFbGb/kXTbWAmGdM+l++caXabEyEHS1fiE8pfxsAEd1Dm0aMBFuaci4+1NGgQjTWk9BQFrp+MxW09hvJkqI4d/Av8A3py2nG4OLU9/qRkEQTde0D/L05spIRzMbKvxrsPTDUn/BJ5Mh0uWfCLMzJYZn5oFwfDpFoFY0stxZ7R9x7ZGY89C412BsSA5jhT7u5bOfT7bf1TSI6mhe3GxX6SPYZUnA380s2XStki2v3QIhuDphKSNJX1Z0h14QMpbxrlKY8JUmXMY1hrSQZX43oKnM5Keh48QZuLhcH6ELwDWc82TpxNTQq3EAGtIB33JPcgrmNnXxrIyQTDC3IKvG7+Tmc0DkHTg+FZpbJkqwiGCxI08CwPPJIRr8PTkjcDuwEVpoaTTmGLv+lRRKz2JWywJD29RrAktYHEzW3S86jZZCbVSMBWQtBS+GttM3KrxJODMIrT905kpIRyCkSf8QIKphqTlgV2B3UprvTwtCeEQDISkFczsgfGuRxAEo0MIhyAIgqDClPBzCIIgCNoRwiEIgiCoEMIhCIIgqBDCIQiCIKjw/wGBLAq+ey+K4wAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df['category'].value_counts().plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Texts for Classification\n", "\n", "These are some of the fields we can use for the classification task. We create 3 different versions." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def tokenize_url(url:str): \n", " url=url.replace(\"https://www.huffingtonpost.com/entry/\",\"\")\n", " url=re.sub(\"(\\W|_)+\",\" \",url)\n", " return url\n", "\n", "df['tokenized_url']=df['link'].apply(lambda x:tokenize_url(x))\n", "\n", "#just the description\n", "df['text_desc'] = df['short_description']\n", "\n", "#description + headline\n", "df['text_desc_headline'] = df['short_description'] + ' '+ df['headline']\n", "\n", "#description + headline + tokenized url\n", "df['text_desc_headline_url'] = df['short_description'] + ' '+ df['headline']+\" \" + df['tokenized_url']\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def _reciprocal_rank(true_labels: list, machine_preds: list):\n", " \"\"\"Compute the reciprocal rank at cutoff k\"\"\"\n", " \n", " # add index to list only if machine predicted label exists in true labels\n", " tp_pos_list = [(idx + 1) for idx, r in 
enumerate(machine_preds) if r in true_labels]\n", "\n", " rr = 0\n", " if len(tp_pos_list) > 0:\n", " # for RR we need position of first correct item\n", " first_pos_list = tp_pos_list[0]\n", " \n", " # rr = 1/rank\n", " rr = 1 / float(first_pos_list)\n", "\n", " return rr\n", "\n", "def compute_mrr_at_k(items:list):\n", " \"\"\"Compute the MRR (average RR) at cutoff k\"\"\"\n", " rr_total = 0\n", " \n", " for item in items: \n", " rr_at_k = _reciprocal_rank(item[0],item[1])\n", " rr_total = rr_total + rr_at_k\n", " mrr = rr_total / 1/float(len(items))\n", "\n", " return mrr\n", "\n", "def collect_preds(Y_test,Y_preds):\n", " \"\"\"Collect all predictions and ground truth\"\"\"\n", " \n", " pred_gold_list=[[[Y_test[idx]],pred] for idx,pred in enumerate(Y_preds)]\n", " return pred_gold_list\n", " \n", "def compute_accuracy(eval_items:list):\n", " correct=0\n", " total=0\n", " \n", " for item in eval_items:\n", " true_pred=item[0]\n", " machine_pred=set(item[1])\n", " \n", " for cat in true_pred:\n", " if cat in machine_pred:\n", " correct+=1\n", " break\n", " \n", " \n", " accuracy=correct/float(len(eval_items))\n", " return accuracy\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_recall_fscore_support\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n", "\n", "import numpy as np\n", "import logging\n", "\n", "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", "\n", "def extract_features(df,field,training_data,testing_data,type=\"binary\"):\n", " \"\"\"Extract features using different methods\"\"\"\n", " \n", " logging.info(\"Extracting features and creating vocabulary...\")\n", " \n", " if \"binary\" in type:\n", " \n", " # BINARY FEATURE REPRESENTATION\n", " cv= 
def extract_features(df, field, training_data, testing_data, type="binary"):
    """Vectorize one text column of the train and test splits.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so the test text never influences the vocabulary.

    Args:
        df: Unused; kept so existing callers do not break.
        field: Name of the text column to vectorize.
        training_data: Training split; ``training_data[field].values`` is the
            text the vectorizer is fitted on.
        testing_data: Test split, transformed with the fitted vectorizer.
        type: Feature representation. A value containing ``"binary"`` selects
            binary term presence, a value containing ``"counts"`` selects raw
            term counts, anything else selects TF-IDF. (The name shadows the
            builtin ``type``; it is kept because callers pass it by keyword.)

    Returns:
        A tuple ``(train_feature_set, test_feature_set, vectorizer)`` holding
        the two feature matrices and the fitted vectorizer.
    """
    logging.info("Extracting features and creating vocabulary...")

    # Substring checks (not equality) are kept from the original behaviour.
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit and transform the training text in one pass instead of fitting,
    # discarding the result, and transforming the same text a second time.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer


def get_top_k_predictions(model, X_test, k):
    """Return the ``k`` most probable classes for every row of ``X_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature matrix passed straight to ``model.predict_proba``.
        k: Number of classes to return per row. Values larger than the number
            of classes return every class.

    Returns:
        One list per row of ``X_test`` containing class labels ordered from
        most to least probable. Each inner list is empty when ``k`` is 0.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {0}".format(k))

    # get probabilities instead of predicted labels, since we want the top k
    probs = np.asarray(model.predict_proba(X_test))

    if k == 0:
        # A "-0:" slice would select every column rather than none.
        return [[] for _ in range(probs.shape[0])]

    # argsort is ascending: keep the last k column indices, then reverse them
    # so the most probable class comes first.
    best_k = np.argsort(probs, axis=1)[:, -k:][:, ::-1]

    classes = np.asarray(model.classes_)
    return [[classes[class_idx] for class_idx in row] for row in best_k]
def train_model(df,field="text_desc",feature_rep="binary",top_k=3):
    """Train and evaluate a logistic-regression news-category classifier.

    Splits ``df`` into train and test parts, vectorizes the ``field`` column,
    fits the classifier on the training part and scores its ``top_k`` ranked
    predictions on the test part.

    Args:
        df: DataFrame with a ``category`` column (the label) and the text
            column named by ``field``.
        field: Name of the text column used as model input.
        feature_rep: Feature representation forwarded to ``extract_features``
            as its ``type`` argument.
        top_k: Number of ranked predictions collected per test example.

    Returns:
        A tuple ``(model, feature_transformer, accuracy, mrr_at_k)``: the
        fitted classifier, the fitted vectorizer, the fraction of test
        examples whose gold label is among the ``top_k`` predictions, and the
        mean reciprocal rank over those predictions.
    """
    logging.info("Starting model training...")

    # Fixed seed so the split, and with it the reported metrics, is repeatable.
    train_split, test_split = train_test_split(df, random_state=2000)

    # Gold labels for both splits.
    gold_train = train_split['category'].values
    gold_test = test_split['category'].values

    # The vectorizer is fitted inside extract_features on the training split.
    X_train, X_test, feature_transformer = extract_features(
        df, field, train_split, test_split, type=feature_rep
    )

    logging.info("Training a Logistic Regression Model...")
    classifier = LogisticRegression(
        verbose=1,
        solver='liblinear',
        random_state=0,
        C=5,
        penalty='l2',
        max_iter=1000,
    )
    model = classifier.fit(X_train, gold_train)

    # Ranked label predictions, paired with the ground truth for scoring.
    ranked_preds = get_top_k_predictions(model, X_test, top_k)
    eval_items = collect_preds(gold_test, ranked_preds)

    logging.info("Starting evaluation...")
    accuracy = compute_accuracy(eval_items)
    mrr_at_k = compute_mrr_at_k(eval_items)

    logging.info("Done training and evaluation.")

    return model, feature_transformer, accuracy, mrr_at_k
MRR=0.48048941798943345\n" ] } ], "source": [ "field='text_desc'\n", "feature_rep='binary'\n", "top_k=3\n", "\n", "model,transformer,accuracy,mrr_at_k=train_model(df,field=field,feature_rep=feature_rep,top_k=top_k)\n", "print(\"\\nAccuracy={0}; MRR={1}\".format(accuracy,mrr_at_k))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model - 2 (tfidf features with description only)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:44:30,242 : INFO : Starting model training...\n", "2019-11-25 12:44:30,308 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:44:33,389 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:45:21,446 : INFO : Starting evaluation...\n", "2019-11-25 12:45:21,515 : INFO : Done training and evaluation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Accuracy=0.6306323604710702; MRR=0.5108380269670774\n" ] } ], "source": [ "# Configuration for this run: description text only, tf-idf weighted features.\n", "field = 'text_desc'\n", "feature_rep = 'tfidf'\n", "top_k = 3  # forwarded to train_model\n", "\n", "# Rebinds the notebook-level `model` / `transformer`; later cells use whichever run executed last.\n", "model, transformer, accuracy, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)\n", "print(\"\\nAccuracy={0}; MRR={1}\".format(accuracy, mrr_at_k))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model - 3 (tfidf features with description, headline, url)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:45:21,554 : INFO : Starting model training...\n", "2019-11-25 12:45:21,620 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:45:27,755 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "2019-11-25 12:46:27,562 : INFO : Starting evaluation...\n", "2019-11-25 12:46:27,634 : INFO : Done training and evaluation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Accuracy=0.8672555043522785; MRR=0.7511520737327071\n" ] } ], "source": [ "# Configuration for this run: description + headline + url text, tf-idf weighted features.\n", "field = 'text_desc_headline_url'\n", "feature_rep = 'tfidf'\n", "top_k = 3  # forwarded to train_model\n", "\n", "# Rebinds the notebook-level `model` / `transformer`; the prediction cells that follow read these names.\n", "model, transformer, accuracy, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)\n", "print(\"\\nAccuracy={0}; MRR={1}\".format(accuracy, mrr_at_k))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check Predictions on Unseen Articles from CNN (not HuffPost, our training data)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['POLITICS', 'CRIME']]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://www.cnn.com/2019/07/19/politics/george-nader-child-porn-sex-charges/index.html\n", "# Encode the CNN snippet with the current `transformer`, then list the model's top-2 categories.\n", "test_features = transformer.transform([\"George Aref Nader, who was a key witness in special counsel Robert Mueller's Russia investigation, faces new charges of transporting a minor with intent to engage in criminal sexual activity and child pornography\"])\n", "get_top_k_predictions(model, test_features, 2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['ENTERTAINMENT', 'STYLE']]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://www.cnn.com/2019/07/18/entertainment/khloe-kardashian-true-thompson-video-trnd/index.html\n", "# Encode the CNN snippet with the current `transformer`, then list the model's top-2 categories.\n", "test_features = transformer.transform([\"True Thompson makes an adorable cameo in Khloe Kardashian's new makeup tutorial video\"])\n", "get_top_k_predictions(model, test_features, 2)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['ENTERTAINMENT', 
'STYLE']]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://www.cnn.com/2019/07/12/entertainment/heidi-klum-tom-kaulitz/\n", "# Name the snippet first, then vectorize it and ask for the two best categories.\n", "cnn_text = \"Heidi Klum is apparently the latest celeb to get married and not tell us\"\n", "test_features = transformer.transform([cnn_text])\n", "get_top_k_predictions(model, test_features, 2)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['BUSINESS', 'POLITICS']]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://www.cnn.com/2019/07/19/investing/dow-stock-market-today/index.html\n", "# Name the snippet first, then vectorize it and ask for the two best categories.\n", "cnn_text = \"Stocks end lower as geopolitical fears rise. The Dow and US markets closed lower on Friday, as geopolitical worries overshadowed the hopes of interest rate cuts by the Federal Reserve.\"\n", "test_features = transformer.transform([cnn_text])\n", "get_top_k_predictions(model, test_features, 2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['SCIENCE', 'HEALTHY LIVING']]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://www.cnn.com/2019/07/19/health/astronaut-exercise-iv-faint-scn/index.html\n", "# Name the snippet first, then vectorize it and ask for the two best categories.\n", "cnn_text = \"Exercise in space keeps astronauts from fainting when they return to Earth, study says. 
\"\n", "test_features = transformer.transform([cnn_text])\n", "get_top_k_predictions(model, test_features, 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train Different Types of Models" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:46:27,728 : INFO : Starting model training...\n", "2019-11-25 12:46:27,788 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:46:30,778 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:49:25,346 : INFO : Starting evaluation...\n", "2019-11-25 12:49:25,419 : INFO : Done training and evaluation.\n", "2019-11-25 12:49:25,462 : INFO : Starting model training...\n", "2019-11-25 12:49:25,523 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:49:28,496 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:53:27,625 : INFO : Starting evaluation...\n", "2019-11-25 12:53:27,701 : INFO : Done training and evaluation.\n", "2019-11-25 12:53:27,735 : INFO : Starting model training...\n", "2019-11-25 12:53:27,797 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:53:31,055 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:54:17,419 : INFO : Starting evaluation...\n", "2019-11-25 12:54:17,493 : INFO : Done training and evaluation.\n", "2019-11-25 12:54:17,527 : INFO : Starting model training...\n", "2019-11-25 12:54:17,606 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:54:22,294 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 12:57:33,965 : INFO : Starting evaluation...\n", "2019-11-25 12:57:34,034 : INFO : Done training and evaluation.\n", "2019-11-25 12:57:34,072 : INFO : Starting model training...\n", "2019-11-25 12:57:34,132 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 12:57:38,488 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 13:02:22,456 : INFO : Starting evaluation...\n", "2019-11-25 13:02:22,513 : INFO : Done training and evaluation.\n", "2019-11-25 13:02:22,546 : INFO : Starting model training...\n", "2019-11-25 13:02:22,594 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 13:02:27,275 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 13:03:19,438 : INFO : Starting evaluation...\n", "2019-11-25 13:03:19,507 : INFO : Done training and evaluation.\n", "2019-11-25 13:03:19,543 : INFO : Starting model training...\n", "2019-11-25 13:03:19,601 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 13:03:25,400 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 13:06:27,931 : INFO : Starting evaluation...\n", "2019-11-25 13:06:28,002 : INFO : Done training and evaluation.\n", "2019-11-25 13:06:28,057 : INFO : Starting model training...\n", "2019-11-25 13:06:28,127 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 13:06:34,953 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 13:11:21,625 : INFO : Starting evaluation...\n", "2019-11-25 13:11:21,697 : INFO : Done training and evaluation.\n", "2019-11-25 13:11:21,746 : INFO : Starting model training...\n", "2019-11-25 13:11:21,805 : INFO : Extracting features and creating vocabulary...\n", "2019-11-25 13:11:28,276 : INFO : Training a Logistic Regression Model...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LibLinear]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2019-11-25 13:12:26,150 : INFO : Starting evaluation...\n", "2019-11-25 13:12:26,222 : INFO : Done training and evaluation.\n" ] } ], "source": [ "# Every (text field, feature representation, top_k) combination, listed in nested-loop order.\n", "feature_reps = ['binary', 'counts', 'tfidf']\n", "fields = ['text_desc', 'text_desc_headline', 'text_desc_headline_url']\n", "top_ks = [3]\n", "configs = [(f, rep, k) for f in fields for rep in feature_reps for k in top_ks]\n", "\n", "# `model` and `transformer` are rebound on each pass, so afterwards they hold the last configuration trained.\n", "results = []\n", "for field, feature_rep, top_k in configs:\n", "    model, transformer, acc, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)\n", "    results.append([field, feature_rep, top_k, acc, mrr_at_k])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Results of Various Models" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text_fieldsfeature_representationtop_kaccuracymrr_at_k
8text_desc_headline_urltfidf30.8672560.751152
6text_desc_headline_urlbinary30.8301650.715587
7text_desc_headline_urlcounts30.8296530.718131
5text_desc_headlinetfidf30.8359250.717171
3text_desc_headlinebinary30.7946750.679169
4text_desc_headlinecounts30.7921790.677894
2text_desctfidf30.6306320.510838
0text_descbinary30.5980540.480489
1text_desccounts30.5955260.478436
\n", "
" ], "text/plain": [ " text_fields feature_representation top_k accuracy mrr_at_k\n", "8 text_desc_headline_url tfidf 3 0.867256 0.751152\n", "6 text_desc_headline_url binary 3 0.830165 0.715587\n", "7 text_desc_headline_url counts 3 0.829653 0.718131\n", "5 text_desc_headline tfidf 3 0.835925 0.717171\n", "3 text_desc_headline binary 3 0.794675 0.679169\n", "4 text_desc_headline counts 3 0.792179 0.677894\n", "2 text_desc tfidf 3 0.630632 0.510838\n", "0 text_desc binary 3 0.598054 0.480489\n", "1 text_desc counts 3 0.595526 0.478436" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tabulate the collected runs; sort descending by text-field group, then by accuracy within each group.\n", "df_results = pd.DataFrame(results, columns=['text_fields', 'feature_representation', 'top_k', 'accuracy', 'mrr_at_k'])\n", "df_results.sort_values(by=['text_fields', 'accuracy'], ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save Model for Future Use\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "\n", "model_path = \"../models/model.pkl\"\n", "transformer_path = \"../models/transformer.pkl\"\n", "\n", "# Save both artifacts: the transformer encodes a document into features,\n", "# and the model makes predictions from those features using its weight vectors.\n", "# NOTE: this pickles whatever `model` / `transformer` currently refer to (the most recently trained pair).\n", "# `with` closes (and flushes) each file even if pickling raises.\n", "with open(model_path, 'wb') as model_file:\n", "    pickle.dump(model, model_file)\n", "with open(transformer_path, 'wb') as transformer_file:\n", "    pickle.dump(transformer, transformer_file)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use Loaded Model" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['POLITICS', 'THE WORLDPOST']]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# SECURITY: pickle.load can execute arbitrary code; only load files written by this notebook.\n", "with open(model_path, 'rb') as model_file:\n", "    loaded_model = pickle.load(model_file)\n", "with open(transformer_path, 'rb') as transformer_file:\n", "    loaded_transformer = pickle.load(transformer_file)\n", "\n", "# Same two steps as with the in-memory objects: encode the text, then ask for the top-2 categories.\n", "test_features = loaded_transformer.transform([\"President Trump AND THE impeachment story !!!\"])\n", "get_top_k_predictions(loaded_model, test_features, 2)\n" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }