Resources Used:
- nltk preprocessing: https://colab.research.google.com/github/gal-a/blog/blob/master/docs/notebooks/nlp/nltk_preprocess.ipynb#scrollTo=0JzUMH4jdXm7
- Towards Data Science, Getting Started with Text Analysis in Python: https://towardsdatascience.com/getting-started-with-text-analysis-in-python-ca13590eb4f7
- GeeksforGeeks, Text Analysis in Python 3: https://www.geeksforgeeks.org/text-analysis-in-python-3/
- Towards AI, Text Mining in Python: Steps and Examples: https://towardsai.net/p/data-mining/text-mining-in-python-steps-and-examples-78b3f8fd913b
- GitHub, Python for Text Analysis course: https://github.com/cltl/python-for-text-analysis
- A Beginner's Guide to Sentiment Analysis in Python: https://towardsdatascience.com/a-beginners-guide-to-sentiment-analysis-in-python-95e354ea84f6
# setup
#!pip install -q wordcloud
import wordcloud
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
from collections import Counter
# check if gpu available for processing
from tensorflow.python.client import device_lib
def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]
print(get_available_devices())
import tensorflow as tf
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()
['/device:CPU:0']
True
# constants and strings
# POS (Parts Of Speech) for: nouns, adjectives, verbs and adverbs
DI_POS_TYPES = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
POS_TYPES = list(DI_POS_TYPES.keys())
# constraints on tokens
MIN_STR_LEN = 2
RE_VALID = '[a-zA-Z0-9]'
# sampling controls to limit memory usage: only every SAMPLE_STEP-th review is processed
ITER = 0
SAMPLE_STEP = 5
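For reference, a quick illustration (not part of the pipeline) of how these constants are used below: nltk.pos_tag returns Penn Treebank tags such as 'NNS' or 'VBD', and the first two characters are looked up in DI_POS_TYPES to choose the WordNet POS passed to the lemmatizer.
# illustrative only; the example tags are what the default tagger typically returns
print(nltk.pos_tag(['dishes']))   # e.g. [('dishes', 'NNS')] -> 'NN' -> 'n'
print(nltk.pos_tag(['quickly']))  # e.g. [('quickly', 'RB')] -> 'RB' -> 'r'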
# read data from source
review_col_list = ["UserID", "ProductID", "Date", "Review"]
dfReviews = pd.read_csv("../YelpData/YelpNYC/ReviewMap.csv", usecols=review_col_list)
ratings_col_list = ["UserID", "ProductID", "StarRating"]
dfRatings = pd.read_csv("../YelpData/YelpNYC/starRatingMap.csv", usecols=ratings_col_list)
ratings_col_list = ["UserID", "ProductID", "FakeReview"]
dfMeta = pd.read_csv("../YelpData/YelpNYC/metaData.csv", usecols=ratings_col_list)
dfReviews = pd.merge(dfReviews, dfRatings, how = 'inner', on = ["UserID", "ProductID"])
print(dfReviews.describe())
print(dfReviews.head(10))
df = pd.merge(dfMeta, dfReviews, how = 'inner', on = ["UserID", "ProductID"])
print(df.describe())
print(df.head(10))
stopwords = nltk.corpus.stopwords.words('english')  # note: rebinding shadows the stopwords module imported above; used as a plain word list below
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# remove accents: keep only ASCII letters and spaces (digits and punctuation are dropped too)
def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data) if x in string.ascii_letters or x == " ")
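A quick illustrative check of remove_accents (not part of the original notebook): NFKD normalization splits accented characters into a base letter plus a combining mark, and everything outside ASCII letters and spaces is discarded.
print(remove_accents("café crème!"))  # -> "cafe creme"
print(remove_accents("naïve 123"))    # -> "naive " (digits are dropped too)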
UserID ProductID FakeReview
count 359052.000000 359052.000000 359052.000000
mean 53992.205533 459.929601 0.794542
std 45806.707721 259.923732 0.607210
min 923.000000 0.000000 -1.000000
25% 13840.000000 247.000000 1.000000
50% 40523.000000 468.000000 1.000000
75% 87314.000000 672.000000 1.000000
max 161147.000000 922.000000 1.000000
UserID ProductID Date \
0 923 0 12/8/2014
1 924 0 5/16/2013
2 925 0 7/1/2013
3 926 0 7/28/2011
4 927 0 11/1/2010
5 928 0 9/2/2009
6 929 0 8/25/2009
7 930 0 5/20/2007
8 931 0 12/27/2005
9 932 0 5/9/2014
Review FakeReview
0 The food at snack is a selection of popular Gr... -1
1 This little place in Soho is wonderful. I had ... -1
2 ordered lunch for 15 from Snack last Friday. ... -1
3 This is a beautiful quaint little restaurant o... -1
4 Snack is great place for a casual sit down lu... -1
5 A solid 4 stars for this greek food spot. If ... -1
6 Let me start with a shout-out to everyone who ... -1
7 Love this place! Try the Chicken sandwich or ... -1
8 My friend and I were intrigued by the nightly ... -1
9 Stopped in for lunch today and couldn't believ... -1
UserID ProductID FakeReview StarRating
count 359052.000000 359052.000000 359052.000000 359052.000000
mean 53992.205533 459.929601 0.794542 4.025871
std 45806.707721 259.923732 0.607210 1.055061
min 923.000000 0.000000 -1.000000 1.000000
25% 13840.000000 247.000000 1.000000 4.000000
50% 40523.000000 468.000000 1.000000 4.000000
75% 87314.000000 672.000000 1.000000 5.000000
max 161147.000000 922.000000 1.000000 5.000000
UserID ProductID FakeReview Date \
0 30262 468 1 10/20/2004
1 107234 510 1 11/2/2004
2 19015 142 1 12/9/2004
3 116117 708 1 3/2/2005
4 59929 454 1 3/7/2005
5 12087 482 1 3/11/2005
6 88647 444 1 3/13/2005
7 25179 80 1 3/19/2005
8 4912 120 1 3/24/2005
9 25178 363 1 3/31/2005
Review StarRating
0 Excellent Soup Dumplings. It's a must if you g... 4
1 One of the best hidden no-name neighborhood pl... 4
2 Really lovely Italian food, very simple and we... 5
3 Mario Batali at his best, this is my current f... 5
4 Best place for brunch if you can handle the wa... 5
5 This cozy, causal restaurant is localed in the... 3
6 Take a bottle of wine, order the mussels, soak... 5
7 moto is circa 1938, dusky mirrors and heavy cu... 5
8 after all the hype i gotta say that some of it... 3
9 If you want to feel like you're in the middle ... 5
# build sentiment into table
df['Sentiment'] = df['StarRating'].map({1 : -1, 2 : -1, 3 : 0, 4 : +1, 5 : +1})
print(df.head(10))
# split the data into three separate frames by sentiment
dfPositive = df[df['Sentiment'] == 1]
dfNeutral = df[df['Sentiment'] == 0]
dfNegative = df[df['Sentiment'] == -1]
print(dfPositive.describe())
print(dfNeutral.describe())
print(dfNegative.describe())
UserID ProductID FakeReview Date \
0 30262 468 1 10/20/2004
1 107234 510 1 11/2/2004
2 19015 142 1 12/9/2004
3 116117 708 1 3/2/2005
4 59929 454 1 3/7/2005
5 12087 482 1 3/11/2005
6 88647 444 1 3/13/2005
7 25179 80 1 3/19/2005
8 4912 120 1 3/24/2005
9 25178 363 1 3/31/2005
Review StarRating Sentiment
0 Excellent Soup Dumplings. It's a must if you g... 4 1
1 One of the best hidden no-name neighborhood pl... 4 1
2 Really lovely Italian food, very simple and we... 5 1
3 Mario Batali at his best, this is my current f... 5 1
4 Best place for brunch if you can handle the wa... 5 1
5 This cozy, causal restaurant is localed in the... 3 0
6 Take a bottle of wine, order the mussels, soak... 5 1
7 moto is circa 1938, dusky mirrors and heavy cu... 5 1
8 after all the hype i gotta say that some of it... 3 0
9 If you want to feel like you're in the middle ... 5 1
UserID ProductID FakeReview StarRating Sentiment
count 276407.000000 276407.000000 276407.000000 276407.000000 276407.0
mean 54546.945714 458.374734 0.799180 4.510685 1.0
std 45782.341084 260.028822 0.601093 0.499887 0.0
min 923.000000 0.000000 -1.000000 4.000000 1.0
25% 14302.500000 247.000000 1.000000 4.000000 1.0
50% 41479.000000 468.000000 1.000000 5.000000 1.0
75% 87907.500000 671.000000 1.000000 5.000000 1.0
max 161147.000000 922.000000 1.000000 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 47646.000000 47646.000000 47646.000000 47646.0 47646.0
mean 43767.620661 461.180876 0.866809 3.0 0.0
std 42529.124524 259.768302 0.498645 0.0 0.0
min 923.000000 0.000000 -1.000000 3.0 0.0
25% 9524.000000 247.000000 1.000000 3.0 0.0
50% 27353.000000 465.000000 1.000000 3.0 0.0
75% 68520.250000 672.000000 1.000000 3.0 0.0
max 161134.000000 922.000000 1.000000 3.0 0.0
UserID ProductID FakeReview StarRating Sentiment
count 34999.000000 34999.000000 34999.000000 34999.000000 34999.0
mean 63530.378097 470.505843 0.659533 1.593588 -1.0
std 47690.882281 259.055087 0.751686 0.491170 0.0
min 923.000000 0.000000 -1.000000 1.000000 -1.0
25% 20004.500000 251.000000 1.000000 1.000000 -1.0
50% 54501.000000 468.000000 1.000000 2.000000 -1.0
75% 100593.000000 688.000000 1.000000 2.000000 -1.0
max 161122.000000 922.000000 1.000000 2.000000 -1.0
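A small sanity check (not in the original notebook) that the three sentiment frames partition the merged data; the counts in the describes above (276407 + 47646 + 34999) sum to the full 359052 rows.
print(df['Sentiment'].value_counts())
assert len(dfPositive) + len(dfNeutral) + len(dfNegative) == len(df)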
# build fake/real into separate frames
dfPosFake = dfPositive[dfPositive['FakeReview'] == -1]
dfNegFake = dfNegative[dfNegative['FakeReview'] == -1]
dfPosReal = dfPositive[dfPositive['FakeReview'] == 1]
dfNegReal = dfNegative[dfNegative['FakeReview'] == 1]
print(dfPosReal.describe())
print(dfNegReal.describe())
print(dfPosFake.describe())
print(dfNegFake.describe())
UserID ProductID FakeReview StarRating Sentiment
count 248653.000000 248653.000000 248653.0 248653.000000 248653.0
mean 51907.433777 457.944244 1.0 4.499294 1.0
std 44874.485078 260.363084 0.0 0.500001 0.0
min 937.000000 0.000000 1.0 4.000000 1.0
25% 13409.000000 247.000000 1.0 4.000000 1.0
50% 38068.000000 468.000000 1.0 4.000000 1.0
75% 82866.000000 672.000000 1.0 5.000000 1.0
max 161147.000000 922.000000 1.0 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 29041.000000 29041.000000 29041.0 29041.000000 29041.0
mean 58941.541820 470.547226 1.0 1.633002 -1.0
std 46871.256419 259.069635 0.0 0.481994 0.0
min 940.000000 0.000000 1.0 1.000000 -1.0
25% 16825.000000 251.000000 1.0 1.000000 -1.0
50% 47844.000000 468.000000 1.0 2.000000 -1.0
75% 93886.000000 688.000000 1.0 2.000000 -1.0
max 161122.000000 922.000000 1.0 2.000000 -1.0
UserID ProductID FakeReview StarRating Sentiment
count 27754.000000 27754.000000 27754.0 27754.000000 27754.0
mean 78194.800497 462.231570 -1.0 4.612741 1.0
std 47030.095529 256.987184 0.0 0.487133 0.0
min 923.000000 0.000000 -1.0 4.000000 1.0
25% 37674.250000 247.000000 -1.0 4.000000 1.0
50% 78376.500000 468.000000 -1.0 5.000000 1.0
75% 118765.750000 666.000000 -1.0 5.000000 1.0
max 161047.000000 922.000000 -1.0 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 5958.000000 5958.000000 5958.0 5958.000000 5958.0
mean 85897.681605 470.304129 -1.0 1.401477 -1.0
std 45272.502481 259.005809 0.0 0.490238 0.0
min 923.000000 1.000000 -1.0 1.000000 -1.0
25% 48264.250000 250.250000 -1.0 1.000000 -1.0
50% 88427.500000 466.000000 -1.0 1.000000 -1.0
75% 126741.750000 688.000000 -1.0 2.000000 -1.0
max 161111.000000 922.000000 -1.0 2.000000 -1.0
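From the counts above, filtered (fake) reviews are a noticeably larger share of negative reviews than of positive ones; a short sketch (not in the original notebook) to compute those rates directly:
for name, frame in [("positive", dfPositive), ("negative", dfNegative)]:
    fake_rate = (frame['FakeReview'] == -1).mean()
    print(f"{name}: {fake_rate:.1%} flagged fake")
# expected from the describes above: positive ≈ 10.0% (27754/276407), negative ≈ 17.0% (5958/34999)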
# process reviews by removing stopwords
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in df['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation (str.maketrans builds the deletion table translate expects)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (dfTokens holds the lemma tokens of the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string in any text columns
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
dfTokens.head(10):
0
0 great
1 food
2 -
3 great
4 drink
5 -
6 -
7 -
8 even
9 pair
dfLemma.head(10):
lem
0 - food - snack - - selection - popular greek dish - - appetizer tray - good - - - greek salad - - - underwhelmed - - main course - - - - table - - - - sometimes hard - get seat -
1 - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
2 pretty cool place - good food - good people
3 - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
4 - good big greek cooking - - come - city - - gorgeous sunday - - brutal winter - - - first clear sunny crisp sunday - walk - soho - - - fav - - - - - - - hungry - decide - try - hole - - wall gem - literally - hole - - wall - - think - perfect - believe - - - table - - - - small - restroom - - - hall - - food - delicious - - - hummus - warm pita - lamb stew - fresh - - perfect - pastitsio - sp - - - perfect - portion - - enough - - dim - light lit candle - - - - perfect way - end - sunday - full tummy - wine - - real gem - - service - good hard - - - - - - small place - - feel - - - someone - home - - guest - - cooking - - home good -
5 - food - amaze - - service - equally amaze - - friend - - - definitely come back - - place -
6 - - - - - notice - - - - - review - - - - - healthiest eater - - - - try - - snack - - - best greek salad - - ever taste - big juicy tomato - crunchy fresh cucumber - fantastic olive oil dress - - - nt eat greek salad typcially - - - - eat - - snack - actually - - crave - - visting new york - - - try - -
7 - taramosalata - - die - - - recommend - shrimp santorini - also - - good friend - - greek love - restaurant - say - taste - authentic -
8 - tiny cafe - thompson - - - favorite - mine - year - - - - tell - - - everything - fresh - - attention - detail make - - keeper - - lamb sammie - ciabatta - melt - - mouth chunk - lamb - - roast onion - pretty much - die - - - - big enough - - gal - share - - sure - - favorite soup - - time - - rock - - - avgolemono - super lemony - perfect - - al dente orzo - serve - toast sliver - olive oil coat fresh bread - - - - take away bag - toss little twist - waxed paper fill - - - jordan almond - pretty adorable - - - - - sucker - - little touch -
9 really delicious sandwich - - lamb - - - enormous - - - able - eat - - - meal - tight - - - - - recommend grab - go - definitely - neat block - visit - lunch - - ever get bore - sullivan st - - accept credit card -
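The tokenize/clean/lemmatize loop above is repeated below, once per review subset. One possible refactor (a sketch only; the notebook keeps the cells separate) wraps the pipeline in a helper so each subset becomes a single call. The names here are illustrative.
def process_reviews(reviews, sample_step=SAMPLE_STEP):
    # returns (kept token tuples, raw token lists, lemma strings) for a series of reviews
    tokens_out, token_lists, lemma_strings = [], [], []
    punct_table = str.maketrans('', '', string.punctuation)
    for i, review in enumerate(reviews):
        if i % sample_step != 0:  # sample every sample_step-th review
            continue
        tokens = [w.lower() for s in nltk.sent_tokenize(review) for w in nltk.word_tokenize(s)]
        review_tokens, review_lemmas = [], []
        for token in tokens:
            token = remove_accents(token).translate(punct_table)
            review_tokens.append(token)
            lemma = "-"  # placeholder kept unless the token survives the filters
            if token not in stopwords and re.search(RE_VALID, token) and len(token) >= MIN_STR_LEN:
                pos = nltk.pos_tag([token])[0][1][:2]
                if pos in POS_TYPES:
                    lemma = lemmatizer.lemmatize(token, pos=DI_POS_TYPES[pos])
                    tokens_out.append((token, stemmer.stem(token), lemma, pos))
            review_lemmas.append(lemma)
        token_lists.append(review_tokens)
        lemma_strings.append(' '.join(review_lemmas))
    return tokens_out, token_lists, lemma_strings
# usage, equivalent to the positive-review cell below:
# reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPositive['Review'])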
# sum of counts
print("Group by lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by lemma'd words, add count and sort:
Get just the first row in each lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 163 nt nt NN 54697
1 place 41 place place NN 50843
2 food 0 food food NN 49892
3 good 8 good good JJ 48059
4 get 17 get get VB 42860
5 go 266 going go VB 39381
6 great 389 great great JJ 34853
7 come 26 come come VB 31299
8 order 458 order order NN 30485
9 time 222 time time NN 25805
Top 10 words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 163 nt nt NN 54697
1 place 41 place place NN 50843
2 food 0 food food NN 49892
8 order 458 order order NN 30485
9 time 222 time time NN 25805
13 wait 438 wait wait NN 22995
14 try 27 try tri NN 22476
15 service 56 service servic NN 21731
17 restaurant 186 restaurant restaur NN 19958
19 love 185 loves love NN 18815
POS_TYPE: JJ
lem index token stem pos counts
3 good 8 good good JJ 48059
6 great 389 great great JJ 34853
18 delicious 94 delicious delici JJ 19434
22 best 49 best best JJ 16375
33 nice 338 nice nice JJ 14203
34 little 243 little littl JJ 14056
45 much 213 much much JJ 11175
46 small 90 small small JJ 11139
62 fresh 100 fresh fresh JJ 9068
80 new 173 new new JJ 7892
POS_TYPE: VB
lem index token stem pos counts
4 get 17 get get VB 42860
5 go 266 going go VB 39381
7 come 26 come come VB 31299
12 make 201 makes make VB 23026
27 say 187 says say VB 15619
31 taste 152 tasted tast VB 14506
32 fry 876 fried fri VB 14325
35 take 239 take take VB 13754
39 amaze 133 amazing amaz VB 12150
41 give 1093 gave gave VB 11647
POS_TYPE: RB
lem index token stem pos counts
10 really 255 really realli RB 24657
11 well 307 better better RB 23251
16 back 140 back back RB 21142
20 also 35 also also RB 18654
28 even 511 even even RB 14871
36 definitely 138 definitely definit RB 13372
42 pretty 39 pretty pretti RB 11560
59 first 69 first first RB 9568
65 always 1977 always alway RB 8927
77 friendly 55 friendly friendli RB 8072
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']  # drop empty strings left over from punctuation stripping
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
flatTokensList[:10]: ['the', 'food', 'at', 'snack', 'is', 'a', 'selection', 'of', 'popular', 'greek']
Frequency Distribution of all words[:30]: [('the', 462518), ('and', 284830), ('i', 238418), ('a', 224568), ('to', 172121), ('was', 159774), ('it', 139586), ('of', 134772), ('is', 111390), ('for', 99904), ('in', 91479), ('with', 81477), ('but', 79300), ('that', 73170), ('we', 69666), ('you', 69586), ('this', 64822), ('my', 63984), ('on', 57278), ('s', 54730), ('nt', 54697), ('had', 51391), ('not', 49412), ('were', 49195), ('food', 49084), ('they', 48617), ('good', 47878), ('so', 46517), ('place', 45590), ('have', 42379)]
Frequency Distribution of lemma[:30]: [('nt', 54697), ('place', 50843), ('food', 49892), ('good', 48059), ('get', 42860), ('go', 39381), ('great', 34853), ('come', 31299), ('order', 30485), ('time', 25805), ('really', 24657), ('well', 23251), ('make', 23026), ('wait', 22995), ('try', 22476), ('service', 21731), ('back', 21142), ('restaurant', 19958), ('delicious', 19434), ('love', 18815), ('also', 18654), ('dish', 16481), ('best', 16375), ('table', 16216), ('eat', 15703), ('sauce', 15684), ('friend', 15674), ('say', 15619), ('even', 14871), ('menu', 14814)]
[plot output: lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistAllWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistLemma.csv", encoding = 'utf-8', index = False, header = False)
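wordcloud is imported in the setup cell but never used in this section; a minimal sketch (assuming the freqDist2 lemma frequencies computed above) of how it could visualize the same data:
from wordcloud import WordCloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(dict(freqDist2))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()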
# process reviews by removing stopwords in positive reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPositive['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Positive dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Positive dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Positive dfTokens.head(10):
0
0 -
1 nt
2 say
3 enough
4 good
5 thing
6 -
7 -
8 place
9 -
Positive dfLemma.head(10):
lem
0 - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
1 pretty cool place - good food - good people
2 - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
3 need - quick bite - stop - - - - review - - - really cute - small - - - - roast sandwich - - - good - service - - friendly - - - nice place - break - shopping
4 quick - delicious - fill - - - - hour - shopping - soho - - starve - - - nt accommodate - - - - - first - - - tiny - - - take - number - call - back - - min later - fresh ingredient - - flavor hit - - right note - - pastitsio - delicate - - hummus - creamy - - dolmades werent - dense - - tart - service - - - smile - - - definitely good - try - - mellow saturday afternoon -
5 novelty meet mediterranean meet soho - - place - - squeeze - - - - - seater table - - - - mean squeeze - - - - money spot - lunch - - quick - - go bite - go - - lamb sandwich - - dressing - fantastic flavor pairing - - - - look - something lighter - - - stomach - - wallet - try - soup - - mediterranean sandwich - full - veggie - - great variety - cute location - good food -
6 - place - tiny - - think - fit - people max - keep - - mind - consider - come - - - weekend night - - food - great - - set romantic - - recommend come - - - - - - area - - - - open seat - oh word - warn - - - guess - - standard - small shop establishment - nyc - - saw - roach - - wall - -
7 perfect - - - name implies - great butter bean salad - even well winter soup - perfect - date - - oneonone dinner - - - nt bring - part - - - - - - - - exclusively single table - seat maybe - - together - really - gem - especially - - - nt advertise - - - greek restaurant - - - food - definitely mediterranean -
8 yums - - - try - carp roe - - - sooooooooooooo good - - - - feta - tomato - - - combination platter - - jam -
9 small place big - taste - stop - - - wife - shopping - absolutely delicious - - friendly waitress -
# sum of counts
print("Group by positive lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 positive words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by positive lemma'd words, add count and sort:
Get just the first row in each positive lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 place 22 place place NN 38165
1 nt 64 nt nt NN 35926
2 food 3 food food NN 35534
3 good 17 good good JJ 34960
4 get 353 get get VB 31360
5 great 131 great great JJ 29575
6 go 111 go go VB 28693
7 come 7 come come VB 22294
8 order 231 ordered order VB 20162
9 time 510 time time NN 19299
Top 10 positive words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 place 22 place place NN 38165
1 nt 64 nt nt NN 35926
2 food 3 food food NN 35534
9 time 510 time time NN 19299
13 try 8 try tri NN 17078
16 love 242 love love NN 16338
18 service 37 service servic NN 14922
20 restaurant 200 restaurant restaur NN 14355
22 sauce 524 sauces sauc NN 12063
23 dish 333 dishes dish NN 11573
POS_TYPE: JJ
lem index token stem pos counts
3 good 17 good good JJ 34960
5 great 131 great great JJ 29575
12 delicious 58 delicious delici JJ 17378
21 best 30 best best JJ 13829
31 nice 53 nice nice JJ 10691
32 little 287 little littl JJ 10673
47 small 47 small small JJ 8115
54 fresh 74 fresh fresh JJ 7545
55 much 1379 much much JJ 7505
77 new 1295 new new JJ 6187
POS_TYPE: VB
lem index token stem pos counts
4 get 353 get get VB 31360
6 go 111 go go VB 28693
7 come 7 come come VB 22294
8 order 231 ordered order VB 20162
11 make 553 makes make VB 17636
15 wait 362 waited wait VB 16345
25 eat 265 eating eat VB 11418
29 amaze 376 amazing amaz VB 11175
33 fry 3261 fried fri VB 10611
35 say 530 say say VB 10172
POS_TYPE: RB
lem index token stem pos counts
10 really 45 really realli RB 17895
14 well 178 better better RB 16389
17 back 71 back back RB 15473
19 also 16 also also RB 14859
26 definitely 90 definitely definit RB 11334
34 even 177 even even RB 10330
50 pretty 20 pretty pretti RB 7684
52 always 1301 always alway RB 7641
61 first 66 first first RB 7107
65 friendly 36 friendly friendli RB 6870
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive flatTokensList[:10]: ['a', 'solid', '', 'stars', 'for', 'this', 'greek', 'food', 'spot', '']
Positive Frequency Distribution of all words[:30]: [('the', 339730), ('and', 215930), ('i', 169996), ('a', 166721), ('to', 121316), ('was', 107807), ('it', 100928), ('of', 100117), ('is', 86846), ('for', 71858), ('in', 68609), ('with', 62532), ('but', 54269), ('you', 54261), ('that', 50960), ('we', 47933), ('this', 47727), ('my', 46778), ('on', 41822), ('s', 40978), ('had', 37770), ('nt', 35926), ('so', 35035), ('food', 34973), ('good', 34810), ('place', 34424), ('they', 34375), ('were', 33137), ('have', 31019), ('not', 30695)]
Positive Frequency Distribution of lemma[:30]: [('place', 38165), ('nt', 35926), ('food', 35534), ('good', 34960), ('get', 31360), ('great', 29575), ('go', 28693), ('come', 22294), ('order', 20162), ('time', 19299), ('really', 17895), ('make', 17636), ('delicious', 17378), ('try', 17078), ('well', 16389), ('wait', 16345), ('love', 16338), ('back', 15473), ('service', 14922), ('also', 14859), ('restaurant', 14355), ('best', 13829), ('sauce', 12063), ('dish', 11573), ('friend', 11426), ('eat', 11418), ('definitely', 11334), ('menu', 11315), ('chicken', 11196), ('amaze', 11175)]
[plot output: positive lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistPosWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in neutral reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNeutral['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Neutral dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Neutral dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Neutral dfTokens.head(10):
0
0 -
1 pizza
2 -
3 -
4 good
5 -
6 -
7 staff
8 -
9 helpful
Neutral dfLemma.head(10):
lem
0 - little place - soho - wonderful - - - - lamb sandwich - - glass - wine - - price shock - - - small - serve - - - - - - - - soho - - staff - - - little snotty - rude - - - food - great - - - nt expect worldclass service -
1 nice little greek restaurant - serf authentic greek dish - - nt go - - - look - - gyro - - - type - greek food - - ever - - - - - - small - quaint restaurant - seat - total - - people - - - food - - good - - - - veal - rice dish - - - portion - - - little small -
2 decent mediterranean place - - space - - small - - - dish - fresh - - service - friendly - - little overprice - - opinion - - - - nt understand - - - - extra charge - pita - especially - soup - hummus - beet salad - - fresh - - - large - - - - beet salad - soup - also tasty - hearty - - particular - lentil soup - - - tad - - salty side -
3 - really enjoy - experience - - tiny yet tasty restaurant - - - honest - - first big - - - - - seat almost immediately - - seat restaurant - - try go past pm - - - friend - jennifer - - - - get - fava - start - - - - - tasty - - thicker texture - hummus - - - - get extra pita - free - - - also get - classic greek avgolemon - - - - first time - - traditional dish - - - soup afterwards - - forever compare - - - - jennifer - say - - - miss - chicken - - even - - - - - - flavorful dish - jennifer get - special - - day lamb dish - - - uniquely flavor - - - - opinion - - bit expensive - - quantity - - - appreciate - - - - - - table - - - - rush - - - restaurant - - - - - people start wait - seat - - tiny space - - - automatically feel guilty - stay - long - - - - great food - - - big expensiveo - - - opinion - - - glad - get - chance - try - - -
4 pro - - food - actually - good con - someone - - nt drink wine - - alcohol - - matter - - picked - wine - - list - - - nt see - - - - - - way - - order - bottle - white - - swear - taste - someone pour - half - bottle - refill - - milwaukee - best - natty lite - also - - service - - - good - pleasant - - - - strange consider - - nt possibly fit - - - people - - entire restaurant -
5 peppinos - far superior - - typical neighborhood slice shop - - - nt - destination pizza place - - staff - friendly - - fault - - almost make - - - error - - food - - - - - last - time - - - order pie - - - - - put - wrong topping - - pie - - - - - picky guy - - - annoy - - - pay - - - - - pizza - - - - sure - - issue - - - hope - get - fix - delivery service - inconsistent - delivery time - - minute - - - food - arrive hot - time - - addition - - pie - salad - sandwich - also delicious - especially - pepperoni hero -
6 - - - - twice - - get - spinach ravioli - time - - - really good - - - - - complaint - - - go - yesterday - - pm - - place - filthy - - floor - disgust - - - - wipe crumb - - - seat -
7 nice pizza restaurant - marguerita pizza - great - - expensive
8 peppino - make - - solid pizza - - ingredient - top notch - - - dough - use - particularly good - - interior - - family style appeal - great neighborhood place - - - complaint - - - price - - - tad high - - local pizza joint - - small plain pizza - - - - - - small - totonno - -
9 - - - bad review - previously post - - make - trip - - - - delightfully surprised - - service - friendly - invite - - - pizza - - thin crust - - bit soggy - - - flavor played - - mouth - - perfect concerto - - make - salieri wannabe jealous - brick oven - - family environment add great touch - - real hero - - pizza - - - make - trip - - try - pasta - - - continued
# sum of counts
print("Group by neutral lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each neutral lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 neutral words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by neutral lemma'd words, add count and sort:
Get just the first row in each neutral lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 19 nt nt NN 10076
1 good 46 good good JJ 9165
2 food 17 food food NN 7420
3 place 1 place place NN 7061
4 get 110 got got VB 6521
5 go 32 go go VB 5436
6 come 376 came came VB 4967
7 order 194 ordered order VB 4959
8 really 89 really realli RB 4240
9 well 452 better better RB 3879
Top 10 neutral words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 19 nt nt NN 10076
2 food 17 food food NN 7420
3 place 1 place place NN 7061
11 time 127 time time NN 3510
13 try 104 try tri NN 3334
14 service 22 service servic NN 3311
17 restaurant 26 restaurant restaur NN 2742
18 table 154 tables tabl NN 2673
19 dish 30 dishes dish NN 2658
26 friend 108 friend friend NN 2346
POS_TYPE: JJ
lem index token stem pos counts
1 good 46 good good JJ 9165
12 great 18 great great JJ 3397
24 nice 23 nice nice JJ 2375
25 little 0 little littl JJ 2360
32 much 1104 much much JJ 2067
42 small 10 small small JJ 1811
55 delicious 271 delicious delici JJ 1477
58 best 205 best best JJ 1456
62 bad 330 bad bad JJ 1418
71 overall 868 overall overal JJ 1200
POS_TYPE: VB
lem index token stem pos counts
4 get 110 got got VB 6521
5 go 32 go go VB 5436
6 come 376 came came VB 4967
7 order 194 ordered order VB 4959
10 wait 159 waiting wait VB 3517
16 make 235 makes make VB 2892
22 say 135 said said VB 2522
23 taste 198 tasted tast VB 2508
29 give 377 give give VB 2192
34 take 1030 took took VB 2064
POS_TYPE: RB
lem index token stem pos counts
8 really 89 really realli RB 4240
9 well 452 better better RB 3879
15 back 558 back back RB 2906
20 pretty 627 pretty pretti RB 2569
21 also 80 also also RB 2523
39 even 138 even even RB 1873
51 definitely 705 definitely definit RB 1558
67 first 97 first first RB 1365
69 still 1333 still still RB 1335
75 maybe 1021 maybe mayb RB 1186
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Neutral flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Neutral flatTokensList[:10]: ['this', 'little', 'place', 'in', 'soho', 'is', 'wonderful', '', 'i', 'had']
Neutral Frequency Distribution of all words[:30]: [('the', 70369), ('and', 38237), ('i', 37776), ('a', 33433), ('was', 29465), ('to', 25569), ('it', 22772), ('of', 19388), ('but', 15686), ('for', 15603), ('is', 14668), ('in', 12669), ('that', 11733), ('with', 11172), ('nt', 10076), ('we', 9917), ('not', 9794), ('good', 9143), ('my', 8938), ('were', 8676), ('on', 8646), ('you', 8532), ('this', 8258), ('s', 8055), ('had', 7600), ('food', 7297), ('they', 7111), ('so', 6428), ('place', 6253), ('at', 5934)]
Neutral Frequency Distribution of lemma[:30]: [('nt', 10076), ('good', 9165), ('food', 7420), ('place', 7061), ('get', 6521), ('go', 5436), ('come', 4967), ('order', 4959), ('really', 4240), ('well', 3879), ('wait', 3517), ('time', 3510), ('great', 3397), ('try', 3334), ('service', 3311), ('back', 2906), ('make', 2892), ('restaurant', 2742), ('table', 2673), ('dish', 2658), ('pretty', 2569), ('also', 2523), ('say', 2522), ('taste', 2508), ('nice', 2375), ('little', 2360), ('friend', 2346), ('chicken', 2261), ('sauce', 2212), ('give', 2192)]
[plot output: neutral lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNeutWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNeutLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in negative reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegative['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Negative dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Negative dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Negative dfTokens.head(10):
0
0 -
1 nt
2 bother
3 -
4 -
5 problem
6 -
7 -
8 wet
9 -
Negative dfLemma.head(10):
lem
0 - meaning - try - place - - whilehighly recommend - - friend - - - tuna sandwich - good - get terribly sick - word - also - sage tea - nice -
1 stop - - lunch takeout - work - - ask - server - - well - roast vegetable sandwhich - vegetable souvlaki - - reply boastfully - - vegetarian souvlaki - apparently - enjoys tzatziki sauce run - - - pita - - - hand - - - - face - unfortunately - - - - - - taste - nt make - - - mess -
2 - walk halfway - manhattan - - restaurant - - wish - - stayed - home - - - decent food - - - seem - - get - - - - corner deli - - flavor seem bland - - menu cryptic - actual ingredient - - salad - - - give kudos - pleasant wine - attentive - - - overbear service -
3 hmm - - far - - - - impressed - stop - - grab - sandwich - take - - woman - - counter clearly wish - - hurry - - make - decision - - place - order - - give - - total - - - - - want - give - exactly - amount - - - - take - - penny - make - - - cent - - cent - - oh - - penny - - - nt - penny - - - - - - - - - - - suppose - know - - - - - nt accept penny - - nt penny money - - - - take - dime - - - penny - wtf - - give - - quarter instead - - - course - get back - dime - - - - need - - change - - told - - sit - - - - sandwich - - brought - - - ready - fine - - - - - seat - anyway - sat - - - bench - tick - tock - tick - tock - - minute go - - - - - - wonder seriously - long - take - make - bloody sandwich - - - go - - ask - - - turn - - sandwich - - sit - - counter - whole time - - girl - - different - - - - - counter say - - oh - - - - - - - - - employee - talk - - - - - place - - mean - - - sit - day - - bench - - watch - yuppie family pile - dknysporting toddler - - luxury suv - - - think - order - sandwich - - - hungry - - sandwich - - - turn - - - really good - lovely ciabatta bread - fresh ingredient - - order - marinate sandwich - - - allinall delightful - - - alidoro - - half - block away - - think - - give snack place - miss next time -
4 - food - average pizzeria - - cheap - add - - - fact - - puked - gut - - - bathroom - - - meal - - - - decide - - go back -
5 want - love - - look great come - - fell short - - - much oilgrease - good amount - cheese - sauce - even - crust - - good texture - - - soft - - - crispy - char - - - - flavor - - star - - sure - experienced well slice - - - star - friendly service - - - - - slicesoda recession deal - - nice owner - - say -
6 food - creative - thought provoke - - make sure - eat - - go - portion - ridiculously small - left - feel shortchanged - hungry - - - celery oyster stew - - know - - brooklyn - - - - - potato - oyster cracker filler cost - much - - stew scarcely cover - bottom - - cavernous bowl - conversation - dinner tend - echo - - expose side - say bowl - heard - story twice - dekalb - maybe focus less - furbish - wall - - restaurant - - - - furbish - wall - - bowl - cheapingredientyetfancynamedstew - probably - nt - - next time - p - semiredeeming quality - squash tot - good concept - - - squash - becomes - - - verb - meeting - fork -
7 - - - excite - try - place - - - - - - - block away - look packed whenever - walk - - holy schizza - - disapointed - - go - brunch - - busy sunday afternoon - - nt wait - get - food - - - pretty hungry - first - - - mimosa - lukewarm - second - - fry - lukewarm - well - taste - - - - cooked - minute ago - - din room - packed - - - - turnover - - decent - - food fresh - - egg - - omelette - - thick - - egg - fill ratio - - - - - - - - - thinly slice mushroom - - entire omelette - - half inch pancake concoction - egg - - - - service - - linger - meal - coffee - - eventually finish - cup - - see - waiter walk - - - fresh pot - coffee - - - already think - - head - - thanks - - thanks - - - - nt even get - chance - kindly pas - refill - coffee - - walk right - - - fill - lady - cup - go right back - put - pot away - maybe le parisien caught - - - bad day - mediocre food - - average service - nt belong - ny - - - good note - - - really cute - - - decor seem tres french -
8 - place - - deserve - star - - smoke salmon - old - taste horrible - - english muffin - - egg florentine - soggy - - omelette - greasy - - website make - restaurant seem upscale - - - cramped - shabby - - - - small toilet next - - kitchen - - trust yelp - want - bring - friend - - beautiful french restaurant - brunch - - - disgust - - instead - - - circumstance - - place rate - star -
9 go - - - second time - - - nt - good - - remember - order - pasta - steak frites - mussel appetizer - - drink - overall - underwhelming - total bill include tip come - - lil - - - - maybe people go - - - - - bistro - - area -
# sum of counts
print("Group by negative lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 negative words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by negative lemma'd words, add count and sort:
Get just the first row in each negative lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 43 nt nt NN 8346
1 food 54 food food NN 6852
2 place 2 place place NN 5643
3 get 9 got got VB 5069
4 go 141 go go VB 5030
5 order 89 order order NN 4608
6 good 8 good good JJ 4262
7 come 223 coming come VB 3914
8 service 73 service servic NN 3241
9 time 156 time time NN 3218
Top 10 negative words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 43 nt nt NN 8346
1 food 54 food food NN 6852
2 place 2 place place NN 5643
5 order 89 order order NN 4608
8 service 73 service servic NN 3241
9 time 156 time time NN 3218
10 wait 342 wait wait NN 3145
13 restaurant 49 restaurant restaur NN 2862
14 table 586 table tabl NN 2812
21 try 1 try tri NN 2119
POS_TYPE: JJ
lem index token stem pos counts
6 good 8 good good JJ 4262
23 bad 422 bad bad JJ 2051
29 great 222 great great JJ 1703
33 much 226 much much JJ 1571
57 nice 16 nice nice JJ 1129
61 small 264 small small JJ 1083
70 little 525 little littl JJ 922
73 best 638 best best JJ 907
85 many 767 many mani JJ 799
92 next 204 next next JJ 769
POS_TYPE: VB
lem index token stem pos counts
3 get 9 got got VB 5069
4 go 141 go go VB 5030
7 come 223 coming come VB 3914
11 say 160 says say VB 2938
17 make 44 make make VB 2517
19 take 80 take take VB 2326
20 ask 21 asked ask VB 2163
22 give 67 give give VB 2069
25 want 92 wanted want VB 1947
31 know 106 know know VB 1589
POS_TYPE: RB
lem index token stem pos counts
12 well 23 better better RB 2891
15 back 122 back back RB 2587
16 really 183 really realli RB 2521
18 even 232 even even RB 2349
34 never 613 never never RB 1555
37 also 13 also also RB 1471
46 first 347 first first RB 1334
62 pretty 345 pretty pretti RB 1065
71 maybe 298 maybe mayb RB 921
72 still 649 still still RB 916
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative flatTokensList[:10]: ['been', 'meaning', 'to', 'try', 'this', 'place', 'for', 'a', 'whilehighly', 'recommended']
Negative Frequency Distribution of all words[:30]: [('the', 53142), ('and', 30653), ('i', 29513), ('to', 24624), ('a', 23473), ('was', 22439), ('it', 16114), ('of', 15044), ('for', 11857), ('we', 11046), ('that', 10699), ('in', 10512), ('is', 10036), ('but', 9389), ('not', 8920), ('this', 8353), ('nt', 8346), ('my', 8013), ('with', 7458), ('were', 7155), ('you', 6863), ('they', 6796), ('food', 6761), ('on', 6700), ('had', 6235), ('at', 6051), ('s', 5612), ('so', 5521), ('have', 5212), ('place', 4960)]
Negative Frequency Distribution of lemma[:30]: [('nt', 8346), ('food', 6852), ('place', 5643), ('get', 5069), ('go', 5030), ('order', 4608), ('good', 4262), ('come', 3914), ('service', 3241), ('time', 3218), ('wait', 3145), ('say', 2938), ('well', 2891), ('restaurant', 2862), ('table', 2812), ('back', 2587), ('really', 2521), ('make', 2517), ('even', 2349), ('take', 2326), ('ask', 2163), ('try', 2119), ('give', 2069), ('bad', 2051), ('taste', 2034), ('want', 1947), ('eat', 1934), ('people', 1803), ('friend', 1768), ('great', 1703)]
[plot output: negative lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNegWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in positive real reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("positive real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive real dfTokens.head(10):
0
0 holy
1 god
2 -
3 -
4 -
5 live
6 -
7 nyc
8 -
9 -
positive real dfLemma.head(10):
lem
0 best place - brunch - - - handle - wait - definitely get - mac - cheese -
1 freeman - - - hyped - belief - - - - bush twin rumor - - - - - - - still good food - - devilsonhorseback - prune stuffed - stilton cheese wrap - bacon - - lovely - - mac - cheese - - - someone - grandma make - - - winebytheglass choice - good - everything - pretty affordable - everyone say - - impossible - get - table - - - - monday - - - place - empty - - waitstaff - really friendly - helpful - -
2 cozy little greek place - love - meze - highly recommend - skordalia - potato garlic puree - - - tzatziki - cucumber yogurt dip - - - also - - - traditional dish - - think - greek food - spanakopitakia - spinach pie - pastitsio - mousaka - - nt forget - greek dessert - yogurt - - honey - - course - - halva - baklava -
3 pylos honor - breadth - traditional greek cuisine - bring fresh cooking - - region - greece - - elegant - contemporary - comfortable set - - east village - next time - - - new york - - nt miss - hidden jewel - - owner christos - - classy man - - take care - - - - - - - best friend - pylosrestaurantcom
4 go - - sunday brunch - - - visit - - - - satisfy meal - - egg - toast - home potato - - nice thick bacon - slice - chocolatepeanut butter cream pie share - - - - end - - nice topper - - impressed - - quality buttermilk biscuit - - table - seat - - greatly need - - long wait - make - starve - bit pricey - - wait detract - star - - - - definately recommend - - lazy sunday meal -
5 come - - buffalo wing - stay - - catfish burger - also feature - buffalo - ny favorite - beef - weck - hockey - ever - - - - - tv - good fry - serve - chipotle mayo -
6 expert - polenta - homemade hummus - chicken - brie sandwich - - pretty much everything else - bonnie - - reliably excellent food - - - loses point - - occasional excessively greasy sirloin burger - - - - - fool - - put - chipotle mayo - - - fry - scarf - - - - - also - amaze beer selection -
7 best espresso - nyc - - - good friend google agrees - - - - - - go - - little - - - - nyc visit friend - - buddy josh - - foodie - - espresso naziconnisseur - show - - new spot - - - tell - - - - want - best espresso - town - - - pay - - cab - let - go - - - - tell - - - walk - - joint - - block away - - - - way - tell - - acidity - foam - temperature - - - - - - sit - - - charm caferestaurant - - thoroughly impressed - - place smell italian - - - tough - explain - old place - europe - - funky smell - - remember forever - - joint - - import - euro stank - authenticity - - work - - - charm - - - pant - - espresso - - best - - ever - - - - drank - share - - - - - acidic - linger - - - mouth - - deep richness - - get home - - - forgotten - name - - amaze cafe - - - know - much - - critic josh - - - google - best espresso nyc - - low - behold - - first result - - quadronno - check - - website - - - see - - sat right - - viking mural -
8 locate - - heart - manhattan - theater district - carmine - - - pack - - - - open - june - - - - - pasta - sound steep - - everything - - primarily - - southern region - italy - - serve family style - - huge platter overflow - food - - need - come - - - empty stomach - - willingness - set aside - diet regime - - day - - carmine - chef - stuff - - - typical italian mother - believe - - - - - waiter - actually advise - - cut back - - feel - - order - much food - - party - - indulge - - hot antipasto - penne - la vodka - - chicken marsala - - brought home enough leftover - - - meal - - carmine - - nt serve - - quantity - everything - fresh - deliciously season - - cooked - order - especially - pasta - - - perfectly al dente - dessert - - delicious - - - - never - room - sample - - - choice - - atmosphere - festive - - - big fat italian wedding - - - - go - - - - - hour - - - pm - - - urgent - - make reservation well - advance - - - mean week - - day - - - otherwise - wait - - table - - long - - accept reservation - - size party - pm - - - pm - - party - - - - - open - pm sunday - monday - midnight - rest - - week - - - - - perfect place - - posttheater supper - - - - nt mind go - bed - - - full stomach - - visit - new york city - complete - - meal - carmine - - - - - recommend - gem enough -
9 - nt go wrong - - - - burger - long line - poor service - - - beef - good - however - - - best - - city - see - review -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive real lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 place 1 place place NN 34855
1 nt 76 nt nt NN 33788
2 good 16 good good JJ 32273
3 food 17 food food NN 31760
4 get 6 get get VB 29120
5 great 661 great great JJ 26826
6 go 119 went went VB 26333
7 come 163 come come VB 21021
8 order 363 ordering order VB 19094
9 time 103 time time NN 17682
Top 10 words by part of speech used in positive real reviews:
POS_TYPE: NN
lem index token stem pos counts
0 place 1 place place NN 34855
1 nt 76 nt nt NN 33788
3 food 17 food food NN 31760
9 time 103 time time NN 17682
13 try 673 try tri NN 15837
14 wait 4 wait wait NN 15251
16 love 53 love love NN 14969
19 service 463 service servic NN 13531
20 restaurant 579 restaurants restaur NN 12877
22 sauce 824 sauce sauc NN 11567
POS_TYPE: JJ
lem index token stem pos counts
2 good 16 good good JJ 32273
5 great 661 great great JJ 26826
12 delicious 396 delicious delici JJ 16104
21 best 0 best best JJ 12730
32 nice 129 nice nice JJ 9830
33 little 50 little littl JJ 9745
45 small 1282 small small JJ 7572
51 fresh 92 fresh fresh JJ 7138
54 much 192 much much JJ 7044
77 new 104 new new JJ 5627
POS_TYPE: VB
lem index token stem pos counts
4 get 6 get get VB 29120
6 go 119 went went VB 26333
7 come 163 come come VB 21021
8 order 363 ordering order VB 19094
11 make 30 made made VB 16254
31 amaze 213 amazing amaz VB 9968
34 say 38 says say VB 9525
38 take 114 take take VB 8628
50 seat 146 seating seat VB 7306
55 give 1669 gives give VB 6889
POS_TYPE: RB
lem index token stem pos counts
10 really 46 really realli RB 16676
15 well 413 well well RB 15039
17 back 361 back back RB 14215
18 also 65 also also RB 13760
24 definitely 5 definitely definit RB 10709
35 even 499 even even RB 9437
49 pretty 35 pretty pretti RB 7324
56 always 629 always alway RB 6806
65 first 306 first first RB 6434
72 friendly 47 friendly friendli RB 6023
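# Caveat on the POS tables above: nltk.pos_tag([newToken]) tags each token in
# isolation, so the tagger has no sentence context and falls back to each word's
# most common tag -- which is why 'love' and 'wait' land under NN here, and
# 'atmosphere' can even surface as RB in later tables. A minimal sketch of the
# difference, reusing the NLTK setup above (example sentence is illustrative only):
sentence = "I love the friendly atmosphere at this place"
tokens = nltk.word_tokenize(sentence)
print([nltk.pos_tag([t])[0] for t in tokens])  # isolated: 'love' -> NN
print(nltk.pos_tag(tokens))                    # in context: 'love' -> VBP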
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("PosRealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
PosRealReview flatTokensList[:10]: ['best', 'place', 'for', 'brunch', 'if', 'you', 'can', 'handle', 'the', 'wait']
PosRealReview Frequency Distribution of all words[:30]: [('the', 314511), ('and', 200258), ('i', 159158), ('a', 155092), ('to', 112463), ('was', 100859), ('it', 94501), ('of', 93623), ('is', 79486), ('for', 66962), ('in', 63505), ('with', 58824), ('but', 50548), ('you', 49955), ('that', 47654), ('we', 44251), ('this', 44214), ('my', 43657), ('on', 38881), ('s', 38550), ('had', 35069), ('nt', 33788), ('so', 32750), ('they', 32167), ('good', 32137), ('place', 31376), ('food', 31202), ('were', 30978), ('not', 28659), ('have', 28368)]
PosRealReview Frequency Distribution of lemma[:30]: [('place', 34855), ('nt', 33788), ('good', 32273), ('food', 31760), ('get', 29120), ('great', 26826), ('go', 26333), ('come', 21021), ('order', 19094), ('time', 17682), ('really', 16676), ('make', 16254), ('delicious', 16104), ('try', 15837), ('wait', 15251), ('well', 15039), ('love', 14969), ('back', 14215), ('also', 13760), ('service', 13531), ('restaurant', 12877), ('best', 12730), ('sauce', 11567), ('dish', 11249), ('definitely', 10709), ('eat', 10628), ('friend', 10589), ('menu', 10494), ('chicken', 10245), ('fry', 10192)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
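# Side note on the stem vs lem columns above: Porter stems such as 'servic',
# 'realli' and 'delici' are truncations rather than dictionary words, which is
# why the counts are grouped on the WordNet lemma instead. A minimal sketch
# contrasting the two, reusing the stemmer and lemmatizer from the setup:
for word, pos in [('restaurants', 'n'), ('ordering', 'v'), ('delicious', 'a')]:
    print(word, '->', stemmer.stem(word), '|', lemmatizer.lemmatize(word, pos=pos))
# restaurants -> restaur | restaurant
# ordering -> order | order
# delicious -> delici | delicious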
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
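# wordcloud is imported in the setup but never used; a minimal sketch of
# visualizing the same lemma frequencies as a word cloud (assumes the
# FreqDistSortedLemmaList built above; generate_from_frequencies takes a
# {word: count} mapping):
from wordcloud import WordCloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(dict(FreqDistSortedLemmaList))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()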
# process reviews by removing stopwords in negative real reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative real dfTokens.head(10):
0
0 -
1 first
2 experience
3 -
4 -
5 restaurant
6 -
7 get
8 -
9 -
negative real dfLemma.head(10):
lem
0 middle eastern cuisine - - mediocre - - ok - - - drunk - want something salty - crunchy - - - give kudos - - amount - crap - - - able - magically fit - - pitahummus - falafel - babaganoush - onion - pickle - cabbage - lettuce - saucebut - - walk - - - - - - - time - - stomach ache - - - people - - know - make - wonder - - really - - tahini -
1 - spending - - - - people - include tip - - cheap bottle - wine - - - - - - satisfied - - say - least - instead - - - left - - delicious memory - yesterday - dinner - probably - - - nt - - delicious - definitely - delicious enough - - value - - food - good - - - even close - great - - agree - - previous review - - - - - - hype - - heard - read - - restaurant - - leaf - wonder - - best - - memorable part - - meal - - complementary muffin - - next day - breakfast - - recommend -
2 pommes frites make - lot - noise - - - double fry - potato - - - - - suppose - - - - - - - - many oddly cooked - mushyontheinside fry - want - go back - - sauce - definitely worth try - - - staff - always willing - give free sample - - last time - - - - - - - liquor license - - - suggest - small deli next door - alcoholic refreshment - - go several time - try - - - - - - - end - prefer - frites - - cafe du bruxelles - - - le halle -
3 - - - hype - - - nt get - - - really disliked - place - hamburger - - flavor - - - - bun - unimpressive - burger joint - shake shack - jg melon - blow - place away -
4 - food - - good - - love - design - - din room - - open kitchen area - however - - get - little noisy - - service - - bit haphazard -
5 defintely - unique place especially - - - risotto - - portion - kind - skimpy - - price - - - get - roast chicken - asparagus - pine nut mix - - - tad bland - - sticky - perhaps - - - give - - shot - - different order - - - nt - impressed -
6 overated pizza - uneven balance - cheese - - great - - - - tourist look - - sample - institution - try grimaldi - - - want - real ny pie -
7 mediocre - - - nt see - everyone make - deal - - place - maybe - - order - bowl - berry - - meal - - try - friend meatloaf - - - ok - - heard - - breakfast - horrible - - decor - - - date - trendy - - look - - keep - clean - - service - - good - - - overall - - - nt impressed -
8 - place - overrate - overprice - overhipsterified - - - - give - kid balloon - - - - photo booth - - basement - - nt mean - - worth - visit -
9 - - nt - - - - - food suck - - service - wish - go elsewhere -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative real lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 52 nt nt NN 7314
1 food 58 food food NN 5640
2 place 137 place place NN 4849
3 get 134 get get VB 4421
4 go 97 go go VB 4281
5 order 190 order order NN 4074
6 good 59 good good JJ 3716
7 come 264 come come VB 3465
8 time 26 times time NN 2786
9 wait 370 waited wait VB 2773
Top 10 words by part of speech used in negative real reviews:
POS_TYPE: NN
lem index token stem pos counts
0 nt 52 nt nt NN 7314
1 food 58 food food NN 5640
2 place 137 place place NN 4849
5 order 190 order order NN 4074
8 time 26 times time NN 2786
10 service 164 service servic NN 2675
13 table 373 table tabl NN 2483
14 restaurant 69 restaurant restaur NN 2359
23 taste 1400 taste tast NN 1766
24 want 6 want want NN 1723
POS_TYPE: JJ
lem index token stem pos counts
6 good 59 good good JJ 3716
25 bad 437 bad bad JJ 1686
29 great 62 great great JJ 1523
32 much 420 much much JJ 1377
57 nice 1026 nice nice JJ 980
62 small 114 small small JJ 905
70 little 162 little littl JJ 807
72 best 72 best best JJ 796
86 next 78 next next JJ 684
89 many 91 many mani JJ 677
POS_TYPE: VB
lem index token stem pos counts
3 get 134 get get VB 4421
4 go 97 go go VB 4281
7 come 264 come come VB 3465
9 wait 370 waited wait VB 2773
12 say 43 say say VB 2522
15 make 31 makes make VB 2310
19 take 397 take take VB 1924
20 ask 570 asked ask VB 1791
21 try 102 trying tri VB 1782
22 give 10 give give VB 1771
POS_TYPE: RB
lem index token stem pos counts
11 well 363 well well RB 2531
16 really 33 really realli RB 2280
17 back 98 back back RB 2170
18 even 60 even even RB 2075
37 also 638 also also RB 1263
41 never 441 never never RB 1189
50 first 374 first first RB 1087
56 pretty 1260 pretty pretti RB 1029
66 still 433 still still RB 853
75 long 371 long long RB 779
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative RealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative RealReview flatTokensList[:10]: ['middle', 'eastern', 'cuisine', 'that', 's', 'mediocre', '', 'and', 'ok', 'if']
Negative RealReview Frequency Distribution of all words[:30]: [('the', 45943), ('and', 26254), ('i', 25613), ('to', 20932), ('a', 20391), ('was', 19491), ('it', 14226), ('of', 13363), ('for', 10324), ('we', 9543), ('that', 9441), ('in', 9052), ('is', 8636), ('but', 8392), ('not', 7654), ('nt', 7314), ('this', 7114), ('my', 6921), ('with', 6398), ('were', 6310), ('you', 5946), ('on', 5842), ('they', 5688), ('food', 5568), ('had', 5176), ('at', 5082), ('s', 4904), ('so', 4707), ('have', 4416), ('place', 4193)]
Negative RealReview Frequency Distribution of lemma[:30]: [('nt', 7314), ('food', 5640), ('place', 4849), ('get', 4421), ('go', 4281), ('order', 4074), ('good', 3716), ('come', 3465), ('time', 2786), ('wait', 2773), ('service', 2675), ('well', 2531), ('say', 2522), ('table', 2483), ('restaurant', 2359), ('make', 2310), ('really', 2280), ('back', 2170), ('even', 2075), ('take', 1924), ('ask', 1791), ('try', 1782), ('give', 1771), ('taste', 1766), ('want', 1723), ('bad', 1686), ('eat', 1583), ('friend', 1544), ('people', 1528), ('great', 1523)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
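# The tokenize/clean/lemmatize loop above is repeated verbatim for each review
# subset; a sketch of factoring it into a helper (hypothetical name
# process_reviews; assumes the stopwords, stemmer, lemmatizer, remove_accents
# and constants from the setup; it samples per subset instead of using the
# global ITER counter, a minor simplification):
def process_reviews(reviews, sample_step=SAMPLE_STEP):
    """Return (token tuples, per-review token lists, per-review lemma strings)."""
    reviews_tokens, token_lists, lemma_strings = [], [], []
    for i, review in enumerate(reviews):
        if i % sample_step != 0:
            continue
        tokens = [w.lower() for s in nltk.sent_tokenize(review)
                  for w in nltk.word_tokenize(s)]
        review_tokens, review_lemmas = [], []
        for token in tokens:
            token = remove_accents(token)
            review_tokens.append(token)
            lem = "-"  # placeholder kept for tokens that are filtered out
            if (token not in stopwords and re.search(RE_VALID, token)
                    and len(token) >= MIN_STR_LEN):
                pos = nltk.pos_tag([token])[0][1][:2]
                if pos in POS_TYPES:
                    lem = lemmatizer.lemmatize(token, pos=DI_POS_TYPES[pos])
                    reviews_tokens.append((token, stemmer.stem(token), lem, pos))
            review_lemmas.append(lem)
        token_lists.append(review_tokens)
        lemma_strings.append(' '.join(review_lemmas))
    return reviews_tokens, token_lists, lemma_strings
# usage: reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPosFake['Review'])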
# process reviews by removing stopwords in positive fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("positive fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive fake dfTokens.head(10):
0
0 -
1 place
2 -
3 recommend
4 -
5 -
6 coworker
7 -
8 come
9 -
positive fake dfLemma.head(10):
lem
0 - right - - - - - deal - - reality - - gramercy tavern - - - bar - night - - week - - eat - - tavern - - - - cocktail - great - - food - solid - - - bartender monthursday - - - - best - ever - - - drink whiskey - - seriously drink - leave la vega seriously - get - - flatiron - even - - - - - menu - also - everything - - ever heard - - service - true - - - ny - - nt - - rare - - food - - back - phenomenal - shy away - - rabit - thumper - - thing - - - - inevitably surround - - series - event dinner - anniversary - funeral - first orgasm - - always get - - something - - - meal - leave - - - prix fixe coma - - - - need - entertain - prospective inlaws - eat - front - talk trash - - bar - marvel - - - - move - many people - elegantly - - restuarant - - drink - - - best cocktail - america - - try - rare german wine - - meal - - get - - - - - - - fyi - - - - single joint -
1 - - - favorite place - - city - - pizza - - worth - hour long wait - take - seat - - bar - - - corner cubby - - drink - - - - fine - lombardi - - also - - - - place willing - serve - - magnum - wine - - really decent price - - perfect place - low key night - good friend - - - avoid delivery - - - - disappointed - single time -
2 - place really know - - - - - foodie kind - joint - quail - varities - fish - rabbit - etc - - - tapasstyle serving - great wine list - hard - go wrong - -
3 - place - great - - people - - tapa plate - - single - - - homerun - sit - watch - food - cooked make - - much well - - nt - eat - cramped tight space - - - make - experience - personal - - - wait - - - hour - - great - - - sent - - corner - bar jamon - - great experience -
4 - order - get tapa - - small table - - need - wait - least - min - - saturday night - - rest assure - - - leave - - satisfied customer - try - - - look - place - - pas - - - - - little hole - - wall - - result - leave - ask - - - - tapa - - tasty especially - patatas fritas - bread wfava bean spread - - pork sandwich - - drink - recommed - white sangria wstrawberries - - finish - night - nice dessert - cake wdulce de leche ice cream - enjoy -
5 wow - - - great restaurant - food - decor - service - top notch - try - rotisserie chicken - - fresh shrimp - - dessert area fantastic - - nt wait - - second visit -
6 feel - southern delight - gotcha - - - - - right place - - gumbo go - - fast - - finger - - - thumbsup - ready - - hot stuff - - cajun martini - - - - make - walk - - wall - - - ceiling - back - - - - side - - - - want - stay - - hangover breakfast - tickle - sore head - - best bloody mary - town -
7 - - - - numerous time - - taste menu - lunch - - service - - food - consistently excellent lot - small touch - really add - - - superb experience - meal - - little bit - - splurge - - definitely worth - -
8 - expensive - - - nice - course mealroom - - little loud - service - excellent - - - - place - royal people - huge gorgeous crystal chandelier overhead - - - live roam violinist - enjoy - food - sip - - wine - - hear - - live music
9 - food - terrific - - dinner - delicious - elegant - brunch - great - classic - super sandwich - great - brunch - family - kid - wonderful service - - - - favorite spot - - neighborhood -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive fake lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 food 12 food food NN 3495
1 place 93 places place NN 3121
2 great 11 great great JJ 3075
3 good 122 good good JJ 2521
4 go 146 go go VB 2275
5 nt 37 nt nt NN 1926
6 get 26 get get VB 1733
7 service 34 service servic NN 1511
8 love 403 love love NN 1466
9 time 128 time time NN 1427
Top 10 words by part of speech used in positive fake reviews:
POS_TYPE: NN
lem index token stem pos counts
0 food 12 food food NN 3495
1 place 93 places place NN 3121
5 nt 37 nt nt NN 1926
7 service 34 service servic NN 1511
8 love 403 love love NN 1466
9 time 128 time time NN 1427
12 restaurant 234 restaurant restaur NN 1286
14 try 83 try tri NN 1204
17 wait 99 wait wait NN 1123
20 order 179 order order NN 1020
POS_TYPE: JJ
lem index token stem pos counts
2 great 11 great great JJ 3075
3 good 122 good good JJ 2521
10 best 16 best best JJ 1359
11 delicious 336 delicious delici JJ 1307
24 nice 223 nice nice JJ 845
43 new 611 new new JJ 591
49 fresh 243 fresh fresh JJ 568
54 little 199 little littl JJ 540
63 small 182 small small JJ 453
78 much 160 much much JJ 395
POS_TYPE: VB
lem index token stem pos counts
4 go 146 go go VB 2275
6 get 26 get get VB 1733
13 make 159 made made VB 1208
15 come 595 come come VB 1194
21 amaze 475 amazing amaz VB 1015
25 eat 8 eating eat VB 810
34 say 1196 said said VB 664
37 taste 287 tasting tast VB 632
39 take 100 take take VB 611
59 fry 1057 fried fri VB 495
POS_TYPE: RB
lem index token stem pos counts
16 back 40 back back RB 1124
18 really 114 really realli RB 1115
19 well 161 better better RB 1088
23 also 30 also also RB 878
27 always 56 always alway RB 776
30 definitely 305 definitely definit RB 728
31 even 28 even even RB 715
33 friendly 820 friendly friendli RB 670
47 atmosphere 676 atmosphere atmospher RB 576
51 ever 17 ever ever RB 558
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive FakeReview flatTokensList[:10]: ['all', 'right', '', 'so', 'here', 's', 'the', 'deal', '', 'the']
Positive FakeReview Frequency Distribution of all words[:30]: [('the', 22041), ('and', 14820), ('i', 10484), ('a', 10136), ('to', 8143), ('is', 7093), ('was', 6140), ('it', 5870), ('of', 5704), ('for', 4617), ('in', 4475), ('with', 3610), ('you', 3475), ('this', 3460), ('food', 3458), ('we', 3059), ('great', 3052), ('but', 3029), ('my', 2927), ('place', 2897), ('that', 2846), ('good', 2509), ('on', 2400), ('had', 2380), ('s', 2362), ('are', 2317), ('they', 2309), ('have', 2279), ('so', 2129), ('very', 1947)]
Positive FakeReview Frequency Distribution of lemma[:30]: [('food', 3495), ('place', 3121), ('great', 3075), ('good', 2521), ('go', 2275), ('nt', 1926), ('get', 1733), ('service', 1511), ('love', 1466), ('time', 1427), ('best', 1359), ('delicious', 1307), ('restaurant', 1286), ('make', 1208), ('try', 1204), ('come', 1194), ('back', 1124), ('wait', 1123), ('really', 1115), ('well', 1088), ('order', 1020), ('amaze', 1015), ('pizza', 946), ('also', 878), ('nice', 845), ('eat', 810), ('friend', 785), ('always', 776), ('staff', 765), ('menu', 758)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
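# The resources list includes a sentiment analysis guide; the real/fake,
# positive/negative split above could be cross-checked against lexicon scores.
# A minimal sketch with NLTK's VADER (assumes nltk.download('vader_lexicon')
# has been run; it scores the raw review text, not the lemmatized strings):
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
for review in dfPosFake['Review'].head(3):
    # compound is in [-1, 1]; > 0 suggests positive, < 0 negative
    print(round(sia.polarity_scores(review)['compound'], 3), review[:60])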
# process reviews by removing stopwords in negative fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative fake dfTokens.head(10):
0
0 honestly
1 -
2 everything
3 get
4 ruin
5 -
6 -
7 service
8 -
9 -
negative fake dfLemma.head(10):
lem
0 overrate - something - - place irks - - - - - bland martini - - - entry - - banal menu - - - - - - - - - - nt go - - - - nt stand say - - never - - -
1 - - - - - pizza - - guess - dont know - real pizza taste - - - try difaras - thats - message - everyone - give - place - star -
2 maitre - refuse - believe - - - reservation - lunchtime - - kept interrupt - - - try - give - name - - party - - - - meet - - - - asstant try - get - - leave twice - - - let - finish - sentence - say - - - - - meet - - - ask - - - - - sure - - - - reservation - - - - business attire - - - - late lunch business meeting - - - busy - - overrun - - - never - - - rude interaction - - restaurant - - entire life - - sushi - fresh - portion small - - slew - wait staff - - much well - maitre - - - intrusive - - constantly interrupt - flow - conversation - - - - - west coast - use - fresh sushi - - - - willing - put - - - attitude - - fresh fish - - - wonder - - - chauvinistic towards woman - - - - woman - - - reservation - - get slightly well treatment - - - maitre - finally look - - reservation - - - - able - say - party - first name - - - get cut - - - - told - - - reservation - - name - - finally blurt - - party - last name - - - spell - - - - - - finally found - reservation - - - - finally show - - table - - - - hang - coat - even - - - - - patron - offer - option - - - say - - - - - - business meeting - - - - bad reception - - maitre - - - assistant - - - - - greet - guest - look - reservation - - - never return - -
3 - want - dine - amy ruth - - - last - year - finally - - opportunity yesterday - - - horrendous - - - uninspired cornbread - taste exactly - - come - - - jiffy cornbread box - - - crabcake - - chock full - artificial crabmeat - - - chicken wing - - order - - appetizer - - - - flavor - - wing - order - - local chinese restaurant - - entire meal - - disaster - - - parent - - south carolina - - spent - summer - - youth - - south - - - know southern cuisine - - - - - - - - inclined - believe - - food - - appreciate - people - - nt know anything - southernsoul cooking - - potato salad - watery - - sweet tea - nt - - - - taste - - - - brown sugar water - - - nt taste - tea - - boyfriend - short rib - - wierd color gravy - - - - - nt bad - - - - nt good - - - - say - sweet potato - - - nt yam - - ok - - guy - - door - - nice - allow - - - seat - - din partner park - car - - server - - nice - - - go - look - - authentic - southern cuisine - - - - good soul food - - past - - - - sunday dinner - - mom - house - week - - amy ruth - - compare - - - live - - - - repuation - - food - - mass produce mess - - -
4 service - - place - horrendous - - - nt - - - occasion - - - try - look past - terrible treatment - - - xiaolong bao - soup dumpling - - - - last visit - - never go back - - teacup - sticky - - - - - lipstick - - - - - - wait - minute - - appetizer - - - - - - - - group - people - - restaurant - - food - - good enough - tolerate - extremely poor service - filthy drink ware -
5 - place - typical - - village bar - boring yuppy clientele - - bridge - tunnel - type - nj - li - - look - - - type - write good review - - place - - - night - go - - played great music - include joy division - blonde redhead - arcade fire - - lcd soundsystem - - - course - crowd - - - - - nt know - - appreciate - music - - people - work - seem - know - - - - - - term - clientele - - place - infest - culturallyclueless buttondownshirt type - girl - - nt exactly looker - - place - nt big - - - - - room - dance - - - course nobody - dance - - - action - - see - drunk ugly couple make - - full view - - bar - - - - - lame type - person - actually enjoys - type - bar - head - - - east village - brooklyn - get - taste - - real nightlife experience -
6 - nt really - go - - place - - open till - - - friend - - - roast pork - duck - -
7 first - - - - real name - jorge menendez estebanzarzuela - - - travel - bogota - bistro - serf - bad chimichurri sauce - - ever taste - even bad - - ex wifesister - - come - - brother - - - - see - - - - operacion masacre - - - - - - - - memory - drown - - authentic chipotle - - ancestry - - - mango margarita - fantastic - - server - - cute red head - polish decent - - - proud people - - deliver - pitcher - great smile - - - - - - bunuelos - - taco de pescado - - french fry - - octopus - - - - - - menu - - - - - wonderful - - - - - recommend - - - taste - travel - - brother - - eat - many year - love - - - - - back - - - - different restaurant - - similar contextual - love always - roger
8 - expect much - base - - recomendations - palma - - - start - - good - - atmosphere - great - - even - - little garden - - back - - service - ok - - - reason - visit - - good food - - - - disappointed - - meal - - bland - - end - take - couple bite - send - away - - - nt recommend go - palma - dinner - - - - - want - nice place - go - - glass - wine - - nice set - - great - - -
9 - ownermain cook - - falafel cart go - - bagel store - - street - insult - owner - pretend - speak loud broken korean - - - minute - - - time - turn - - friend - say - - - get away - - - - - - customer gon na - - beat - - - - - food - good - - - - boycotting - place - - - grandmother - life - - - exactly - happen -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative fake lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 food 197 food food NN 1215
1 nt 9 nt nt NN 1189
2 place 2 place place NN 925
3 go 10 go go VB 847
4 get 44 get get VB 792
5 order 174 ordered order VB 668
6 good 227 good good JJ 615
7 come 163 came came VB 605
8 restaurant 66 restaurant restaur NN 575
9 service 268 service servic NN 570
Top 10 words by part of speech used in negative fake reviews:
POS_TYPE: NN
lem index token stem pos counts
0 food 197 food food NN 1215
1 nt 9 nt nt NN 1189
2 place 2 place place NN 925
8 restaurant 66 restaurant restaur NN 575
9 service 268 service servic NN 570
10 time 529 time time NN 521
11 wait 74 wait wait NN 516
13 table 129 table tabl NN 495
26 friend 398 friends friend NN 315
27 minute 291 minutes minut NN 305
POS_TYPE: JJ
lem index token stem pos counts
6 good 227 good good JJ 615
15 bad 139 bad bad JJ 408
30 great 327 great great JJ 269
42 much 76 much much JJ 215
60 many 460 many mani JJ 160
63 small 72 small small JJ 155
66 nice 236 nice nice JJ 152
70 last 121 last last JJ 145
71 new 712 new new JJ 144
78 next 908 next next JJ 133
POS_TYPE: VB
lem index token stem pos counts
3 go 10 go go VB 847
4 get 44 get get VB 792
5 order 174 ordered order VB 668
7 come 163 came came VB 605
12 say 13 saying say VB 511
16 make 372 making make VB 405
18 ask 52 asked ask VB 382
21 eat 459 eaten eaten VB 362
23 take 494 taking take VB 358
24 give 27 gave gave VB 327
POS_TYPE: RB
lem index token stem pos counts
14 back 286 back back RB 445
17 well 77 better better RB 399
19 even 132 even even RB 373
20 never 14 never never RB 364
22 really 393 really realli RB 358
43 first 111 first first RB 211
53 also 1032 also also RB 181
59 ever 415 ever ever RB 162
80 long 2806 long long RB 133
84 away 498 away away RB 128
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative FakeReview flatTokensList[:10]: ['overrated', '', 'something', 'about', 'this', 'place', 'irks', 'me', '', 'was']
Negative FakeReview Frequency Distribution of all words[:30]: [('the', 8101), ('and', 4719), ('i', 4506), ('to', 4150), ('a', 3475), ('was', 3226), ('it', 2349), ('of', 2242), ('we', 1937), ('for', 1825), ('in', 1762), ('that', 1633), ('is', 1625), ('not', 1426), ('this', 1367), ('but', 1337), ('my', 1271), ('food', 1199), ('nt', 1189), ('they', 1109), ('with', 1073), ('were', 1072), ('you', 1019), ('at', 1009), ('on', 998), ('had', 953), ('have', 886), ('place', 813), ('s', 805), ('so', 791)]
Negative FakeReview Frequency Distribution of lemma[:30]: [('food', 1215), ('nt', 1189), ('place', 925), ('go', 847), ('get', 792), ('order', 668), ('good', 615), ('come', 605), ('restaurant', 575), ('service', 570), ('time', 521), ('wait', 516), ('say', 511), ('table', 495), ('back', 445), ('bad', 408), ('make', 405), ('well', 399), ('ask', 382), ('even', 373), ('never', 364), ('eat', 362), ('really', 358), ('take', 358), ('give', 327), ('want', 321), ('friend', 315), ('minute', 305), ('try', 302), ('people', 294)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
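# The four frequency CSVs exported above can also be compared directly in
# pandas; a minimal sketch (assumes the files written by the to_csv calls
# above, where column 0 is the lemma and column 1 its count; counts are
# normalized so the differently sized subsets are comparable):
files = {
    'posReal': '../YelpData/YelpNYC/freqDistPosRealRevLemma.csv',
    'negReal': '../YelpData/YelpNYC/freqDistNegRealRevLemma.csv',
    'posFake': '../YelpData/YelpNYC/freqDistPosFakeRevLemma.csv',
    'negFake': '../YelpData/YelpNYC/freqDistNegFakeRevLemma.csv',
}
frames = []
for label, path in files.items():
    dfFreq = pd.read_csv(path, header=None, names=['lem', label]).set_index('lem')
    frames.append(dfFreq[label] / dfFreq[label].sum())  # relative frequency
dfCompare = pd.concat(frames, axis=1).fillna(0)
print(dfCompare.sort_values('posFake', ascending=False).head(20))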