Resources Used:
- nltk preprocessing: https://colab.research.google.com/github/gal-a/blog/blob/master/docs/notebooks/nlp/nltk_preprocess.ipynb#scrollTo=0JzUMH4jdXm7
- Towards Data Science, Getting Started with Text Analysis in Python: https://towardsdatascience.com/getting-started-with-text-analysis-in-python-ca13590eb4f7
- GeeksforGeeks, Text Analysis in Python 3: https://www.geeksforgeeks.org/text-analysis-in-python-3/
- Towards AI, Text Mining in Python: Steps and Examples: https://towardsai.net/p/data-mining/text-mining-in-python-steps-and-examples-78b3f8fd913b
- GitHub, Python for Text Analysis course: https://github.com/cltl/python-for-text-analysis
- A Beginner's Guide to Sentiment Analysis in Python: https://towardsdatascience.com/a-beginners-guide-to-sentiment-analysis-in-python-95e354ea84f6
# setup
#!pip install -q wordcloud
import wordcloud
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
from collections import Counter
# check if gpu available for processing
from tensorflow.python.client import device_lib
def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]
print(get_available_devices())
import tensorflow as tf
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()
['/device:CPU:0']
True
# constants and strings
# POS (Parts Of Speech) for: nouns, adjectives, verbs and adverbs
DI_POS_TYPES = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
POS_TYPES = list(DI_POS_TYPES.keys())
# constraints on tokens
MIN_STR_LEN = 2
RE_VALID = '[a-zA-Z0-9]'
# sampling controls to limit memory usage: only every SAMPLE_STEP-th review is processed
ITER = 0
SAMPLE_STEP = 5
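For reference, a quick illustration (not part of the pipeline) of how these constants are used below: nltk.pos_tag returns Penn Treebank tags such as 'NNS' or 'VBD', and the first two characters are looked up in DI_POS_TYPES to choose the WordNet POS passed to the lemmatizer.
# illustrative only; the example tags are what the default tagger typically returns
print(nltk.pos_tag(['dishes']))   # e.g. [('dishes', 'NNS')] -> 'NN' -> 'n'
print(nltk.pos_tag(['quickly']))  # e.g. [('quickly', 'RB')] -> 'RB' -> 'r'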
# read data from source
review_col_list = ["UserID", "ProductID", "Date", "Review"]
dfReviews = pd.read_csv("../YelpData/YelpNYC/ReviewMap.csv", usecols=review_col_list)
ratings_col_list = ["UserID", "ProductID", "StarRating"]
dfRatings = pd.read_csv("../YelpData/YelpNYC/starRatingMap.csv", usecols=ratings_col_list)
ratings_col_list = ["UserID", "ProductID", "FakeReview"]
dfMeta = pd.read_csv("../YelpData/YelpNYC/metaData.csv", usecols=ratings_col_list)
dfReviews = pd.merge(dfReviews, dfRatings, how = 'inner', on = ["UserID", "ProductID"])
print(dfReviews.describe())
print(dfReviews.head(10))
df = pd.merge(dfMeta, dfReviews, how = 'inner', on = ["UserID", "ProductID"])
print(df.describe())
print(df.head(10))
stopwords = nltk.corpus.stopwords.words('english')  # note: rebinding shadows the stopwords module imported above; used as a plain word list below
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# remove accents: keep only ASCII letters and spaces (digits and punctuation are dropped too)
def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data) if x in string.ascii_letters or x == " ")
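A quick illustrative check of remove_accents (not part of the original notebook): NFKD normalization splits accented characters into a base letter plus a combining mark, and everything outside ASCII letters and spaces is discarded.
print(remove_accents("café crème!"))  # -> "cafe creme"
print(remove_accents("naïve 123"))    # -> "naive " (digits are dropped too)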
UserID ProductID FakeReview
count 359052.000000 359052.000000 359052.000000
mean 53992.205533 459.929601 0.794542
std 45806.707721 259.923732 0.607210
min 923.000000 0.000000 -1.000000
25% 13840.000000 247.000000 1.000000
50% 40523.000000 468.000000 1.000000
75% 87314.000000 672.000000 1.000000
max 161147.000000 922.000000 1.000000
UserID ProductID Date \
0 923 0 12/8/2014
1 924 0 5/16/2013
2 925 0 7/1/2013
3 926 0 7/28/2011
4 927 0 11/1/2010
5 928 0 9/2/2009
6 929 0 8/25/2009
7 930 0 5/20/2007
8 931 0 12/27/2005
9 932 0 5/9/2014
Review FakeReview
0 The food at snack is a selection of popular Gr... -1
1 This little place in Soho is wonderful. I had ... -1
2 ordered lunch for 15 from Snack last Friday. ... -1
3 This is a beautiful quaint little restaurant o... -1
4 Snack is great place for a casual sit down lu... -1
5 A solid 4 stars for this greek food spot. If ... -1
6 Let me start with a shout-out to everyone who ... -1
7 Love this place! Try the Chicken sandwich or ... -1
8 My friend and I were intrigued by the nightly ... -1
9 Stopped in for lunch today and couldn't believ... -1
UserID ProductID FakeReview StarRating
count 359052.000000 359052.000000 359052.000000 359052.000000
mean 53992.205533 459.929601 0.794542 4.025871
std 45806.707721 259.923732 0.607210 1.055061
min 923.000000 0.000000 -1.000000 1.000000
25% 13840.000000 247.000000 1.000000 4.000000
50% 40523.000000 468.000000 1.000000 4.000000
75% 87314.000000 672.000000 1.000000 5.000000
max 161147.000000 922.000000 1.000000 5.000000
UserID ProductID FakeReview Date \
0 30262 468 1 10/20/2004
1 107234 510 1 11/2/2004
2 19015 142 1 12/9/2004
3 116117 708 1 3/2/2005
4 59929 454 1 3/7/2005
5 12087 482 1 3/11/2005
6 88647 444 1 3/13/2005
7 25179 80 1 3/19/2005
8 4912 120 1 3/24/2005
9 25178 363 1 3/31/2005
Review StarRating
0 Excellent Soup Dumplings. It's a must if you g... 4
1 One of the best hidden no-name neighborhood pl... 4
2 Really lovely Italian food, very simple and we... 5
3 Mario Batali at his best, this is my current f... 5
4 Best place for brunch if you can handle the wa... 5
5 This cozy, causal restaurant is localed in the... 3
6 Take a bottle of wine, order the mussels, soak... 5
7 moto is circa 1938, dusky mirrors and heavy cu... 5
8 after all the hype i gotta say that some of it... 3
9 If you want to feel like you're in the middle ... 5
# build sentiment into table
df['Sentiment'] = df['StarRating'].map({1 : -1, 2 : -1, 3 : 0, 4 : +1, 5 : +1})
print(df.head(10))
# split the data into three separate frames by sentiment
dfPositive = df[df['Sentiment'] == 1]
dfNeutral = df[df['Sentiment'] == 0]
dfNegative = df[df['Sentiment'] == -1]
print(dfPositive.describe())
print(dfNeutral.describe())
print(dfNegative.describe())
UserID ProductID FakeReview Date \
0 30262 468 1 10/20/2004
1 107234 510 1 11/2/2004
2 19015 142 1 12/9/2004
3 116117 708 1 3/2/2005
4 59929 454 1 3/7/2005
5 12087 482 1 3/11/2005
6 88647 444 1 3/13/2005
7 25179 80 1 3/19/2005
8 4912 120 1 3/24/2005
9 25178 363 1 3/31/2005
Review StarRating Sentiment
0 Excellent Soup Dumplings. It's a must if you g... 4 1
1 One of the best hidden no-name neighborhood pl... 4 1
2 Really lovely Italian food, very simple and we... 5 1
3 Mario Batali at his best, this is my current f... 5 1
4 Best place for brunch if you can handle the wa... 5 1
5 This cozy, causal restaurant is localed in the... 3 0
6 Take a bottle of wine, order the mussels, soak... 5 1
7 moto is circa 1938, dusky mirrors and heavy cu... 5 1
8 after all the hype i gotta say that some of it... 3 0
9 If you want to feel like you're in the middle ... 5 1
UserID ProductID FakeReview StarRating Sentiment
count 276407.000000 276407.000000 276407.000000 276407.000000 276407.0
mean 54546.945714 458.374734 0.799180 4.510685 1.0
std 45782.341084 260.028822 0.601093 0.499887 0.0
min 923.000000 0.000000 -1.000000 4.000000 1.0
25% 14302.500000 247.000000 1.000000 4.000000 1.0
50% 41479.000000 468.000000 1.000000 5.000000 1.0
75% 87907.500000 671.000000 1.000000 5.000000 1.0
max 161147.000000 922.000000 1.000000 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 47646.000000 47646.000000 47646.000000 47646.0 47646.0
mean 43767.620661 461.180876 0.866809 3.0 0.0
std 42529.124524 259.768302 0.498645 0.0 0.0
min 923.000000 0.000000 -1.000000 3.0 0.0
25% 9524.000000 247.000000 1.000000 3.0 0.0
50% 27353.000000 465.000000 1.000000 3.0 0.0
75% 68520.250000 672.000000 1.000000 3.0 0.0
max 161134.000000 922.000000 1.000000 3.0 0.0
UserID ProductID FakeReview StarRating Sentiment
count 34999.000000 34999.000000 34999.000000 34999.000000 34999.0
mean 63530.378097 470.505843 0.659533 1.593588 -1.0
std 47690.882281 259.055087 0.751686 0.491170 0.0
min 923.000000 0.000000 -1.000000 1.000000 -1.0
25% 20004.500000 251.000000 1.000000 1.000000 -1.0
50% 54501.000000 468.000000 1.000000 2.000000 -1.0
75% 100593.000000 688.000000 1.000000 2.000000 -1.0
max 161122.000000 922.000000 1.000000 2.000000 -1.0
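A small sanity check (not in the original notebook) that the three sentiment frames partition the merged data; the counts in the describes above (276407 + 47646 + 34999) sum to the full 359052 rows.
print(df['Sentiment'].value_counts())
assert len(dfPositive) + len(dfNeutral) + len(dfNegative) == len(df)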
# build fake/real into separate frames
dfPosFake = dfPositive[dfPositive['FakeReview'] == -1]
dfNegFake = dfNegative[dfNegative['FakeReview'] == -1]
dfPosReal = dfPositive[dfPositive['FakeReview'] == 1]
dfNegReal = dfNegative[dfNegative['FakeReview'] == 1]
print(dfPosReal.describe())
print(dfNegReal.describe())
print(dfPosFake.describe())
print(dfNegFake.describe())
UserID ProductID FakeReview StarRating Sentiment
count 248653.000000 248653.000000 248653.0 248653.000000 248653.0
mean 51907.433777 457.944244 1.0 4.499294 1.0
std 44874.485078 260.363084 0.0 0.500001 0.0
min 937.000000 0.000000 1.0 4.000000 1.0
25% 13409.000000 247.000000 1.0 4.000000 1.0
50% 38068.000000 468.000000 1.0 4.000000 1.0
75% 82866.000000 672.000000 1.0 5.000000 1.0
max 161147.000000 922.000000 1.0 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 29041.000000 29041.000000 29041.0 29041.000000 29041.0
mean 58941.541820 470.547226 1.0 1.633002 -1.0
std 46871.256419 259.069635 0.0 0.481994 0.0
min 940.000000 0.000000 1.0 1.000000 -1.0
25% 16825.000000 251.000000 1.0 1.000000 -1.0
50% 47844.000000 468.000000 1.0 2.000000 -1.0
75% 93886.000000 688.000000 1.0 2.000000 -1.0
max 161122.000000 922.000000 1.0 2.000000 -1.0
UserID ProductID FakeReview StarRating Sentiment
count 27754.000000 27754.000000 27754.0 27754.000000 27754.0
mean 78194.800497 462.231570 -1.0 4.612741 1.0
std 47030.095529 256.987184 0.0 0.487133 0.0
min 923.000000 0.000000 -1.0 4.000000 1.0
25% 37674.250000 247.000000 -1.0 4.000000 1.0
50% 78376.500000 468.000000 -1.0 5.000000 1.0
75% 118765.750000 666.000000 -1.0 5.000000 1.0
max 161047.000000 922.000000 -1.0 5.000000 1.0
UserID ProductID FakeReview StarRating Sentiment
count 5958.000000 5958.000000 5958.0 5958.000000 5958.0
mean 85897.681605 470.304129 -1.0 1.401477 -1.0
std 45272.502481 259.005809 0.0 0.490238 0.0
min 923.000000 1.000000 -1.0 1.000000 -1.0
25% 48264.250000 250.250000 -1.0 1.000000 -1.0
50% 88427.500000 466.000000 -1.0 1.000000 -1.0
75% 126741.750000 688.000000 -1.0 2.000000 -1.0
max 161111.000000 922.000000 -1.0 2.000000 -1.0
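From the counts above, filtered (fake) reviews are a noticeably larger share of negative reviews than of positive ones; a short sketch (not in the original notebook) to compute those rates directly:
for name, frame in [("positive", dfPositive), ("negative", dfNegative)]:
    fake_rate = (frame['FakeReview'] == -1).mean()
    print(f"{name}: {fake_rate:.1%} flagged fake")
# expected from the describes above: positive ≈ 10.0% (27754/276407), negative ≈ 17.0% (5958/34999)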
# process reviews by removing stopwords
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in df['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation (str.maketrans builds the deletion table translate expects)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (dfTokens holds the lemma tokens of the last sampled review)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string in any text columns
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
dfTokens.head(10):
0
0 great
1 food
2 -
3 great
4 drink
5 -
6 -
7 -
8 even
9 pair
dfLemma.head(10):
lem
0 - food - snack - - selection - popular greek dish - - appetizer tray - good - - - greek salad - - - underwhelmed - - main course - - - - table - - - - sometimes hard - get seat -
1 - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
2 pretty cool place - good food - good people
3 - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
4 - good big greek cooking - - come - city - - gorgeous sunday - - brutal winter - - - first clear sunny crisp sunday - walk - soho - - - fav - - - - - - - hungry - decide - try - hole - - wall gem - literally - hole - - wall - - think - perfect - believe - - - table - - - - small - restroom - - - hall - - food - delicious - - - hummus - warm pita - lamb stew - fresh - - perfect - pastitsio - sp - - - perfect - portion - - enough - - dim - light lit candle - - - - perfect way - end - sunday - full tummy - wine - - real gem - - service - good hard - - - - - - small place - - feel - - - someone - home - - guest - - cooking - - home good -
5 - food - amaze - - service - equally amaze - - friend - - - definitely come back - - place -
6 - - - - - notice - - - - - review - - - - - healthiest eater - - - - try - - snack - - - best greek salad - - ever taste - big juicy tomato - crunchy fresh cucumber - fantastic olive oil dress - - - nt eat greek salad typcially - - - - eat - - snack - actually - - crave - - visting new york - - - try - -
7 - taramosalata - - die - - - recommend - shrimp santorini - also - - good friend - - greek love - restaurant - say - taste - authentic -
8 - tiny cafe - thompson - - - favorite - mine - year - - - - tell - - - everything - fresh - - attention - detail make - - keeper - - lamb sammie - ciabatta - melt - - mouth chunk - lamb - - roast onion - pretty much - die - - - - big enough - - gal - share - - sure - - favorite soup - - time - - rock - - - avgolemono - super lemony - perfect - - al dente orzo - serve - toast sliver - olive oil coat fresh bread - - - - take away bag - toss little twist - waxed paper fill - - - jordan almond - pretty adorable - - - - - sucker - - little touch -
9 really delicious sandwich - - lamb - - - enormous - - - able - eat - - - meal - tight - - - - - recommend grab - go - definitely - neat block - visit - lunch - - ever get bore - sullivan st - - accept credit card -
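The tokenize/clean/lemmatize loop above is repeated below, once per review subset. One possible refactor (a sketch only; the notebook keeps the cells separate) wraps the pipeline in a helper so each subset becomes a single call. The names here are illustrative.
def process_reviews(reviews, sample_step=SAMPLE_STEP):
    # returns (kept token tuples, raw token lists, lemma strings) for a series of reviews
    tokens_out, token_lists, lemma_strings = [], [], []
    punct_table = str.maketrans('', '', string.punctuation)
    for i, review in enumerate(reviews):
        if i % sample_step != 0:  # sample every sample_step-th review
            continue
        tokens = [w.lower() for s in nltk.sent_tokenize(review) for w in nltk.word_tokenize(s)]
        review_tokens, review_lemmas = [], []
        for token in tokens:
            token = remove_accents(token).translate(punct_table)
            review_tokens.append(token)
            lemma = "-"  # placeholder kept unless the token survives the filters
            if token not in stopwords and re.search(RE_VALID, token) and len(token) >= MIN_STR_LEN:
                pos = nltk.pos_tag([token])[0][1][:2]
                if pos in POS_TYPES:
                    lemma = lemmatizer.lemmatize(token, pos=DI_POS_TYPES[pos])
                    tokens_out.append((token, stemmer.stem(token), lemma, pos))
            review_lemmas.append(lemma)
        token_lists.append(review_tokens)
        lemma_strings.append(' '.join(review_lemmas))
    return tokens_out, token_lists, lemma_strings
# usage, equivalent to the positive-review cell below:
# reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPositive['Review'])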
# sum of counts
print("Group by lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by lemma'd words, add count and sort:
Get just the first row in each lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 163 nt nt NN 54697
1 place 41 place place NN 50843
2 food 0 food food NN 49892
3 good 8 good good JJ 48059
4 get 17 get get VB 42860
5 go 266 going go VB 39381
6 great 389 great great JJ 34853
7 come 26 come come VB 31299
8 order 458 order order NN 30485
9 time 222 time time NN 25805
Top 10 words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 163 nt nt NN 54697
1 place 41 place place NN 50843
2 food 0 food food NN 49892
8 order 458 order order NN 30485
9 time 222 time time NN 25805
13 wait 438 wait wait NN 22995
14 try 27 try tri NN 22476
15 service 56 service servic NN 21731
17 restaurant 186 restaurant restaur NN 19958
19 love 185 loves love NN 18815
POS_TYPE: JJ
lem index token stem pos counts
3 good 8 good good JJ 48059
6 great 389 great great JJ 34853
18 delicious 94 delicious delici JJ 19434
22 best 49 best best JJ 16375
33 nice 338 nice nice JJ 14203
34 little 243 little littl JJ 14056
45 much 213 much much JJ 11175
46 small 90 small small JJ 11139
62 fresh 100 fresh fresh JJ 9068
80 new 173 new new JJ 7892
POS_TYPE: VB
lem index token stem pos counts
4 get 17 get get VB 42860
5 go 266 going go VB 39381
7 come 26 come come VB 31299
12 make 201 makes make VB 23026
27 say 187 says say VB 15619
31 taste 152 tasted tast VB 14506
32 fry 876 fried fri VB 14325
35 take 239 take take VB 13754
39 amaze 133 amazing amaz VB 12150
41 give 1093 gave gave VB 11647
POS_TYPE: RB
lem index token stem pos counts
10 really 255 really realli RB 24657
11 well 307 better better RB 23251
16 back 140 back back RB 21142
20 also 35 also also RB 18654
28 even 511 even even RB 14871
36 definitely 138 definitely definit RB 13372
42 pretty 39 pretty pretti RB 11560
59 first 69 first first RB 9568
65 always 1977 always alway RB 8927
77 friendly 55 friendly friendli RB 8072
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']  # drop empty strings left over from punctuation stripping
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
flatTokensList[:10]: ['the', 'food', 'at', 'snack', 'is', 'a', 'selection', 'of', 'popular', 'greek']
Frequency Distribution of all words[:30]: [('the', 462518), ('and', 284830), ('i', 238418), ('a', 224568), ('to', 172121), ('was', 159774), ('it', 139586), ('of', 134772), ('is', 111390), ('for', 99904), ('in', 91479), ('with', 81477), ('but', 79300), ('that', 73170), ('we', 69666), ('you', 69586), ('this', 64822), ('my', 63984), ('on', 57278), ('s', 54730), ('nt', 54697), ('had', 51391), ('not', 49412), ('were', 49195), ('food', 49084), ('they', 48617), ('good', 47878), ('so', 46517), ('place', 45590), ('have', 42379)]
Frequency Distribution of lemma[:30]: [('nt', 54697), ('place', 50843), ('food', 49892), ('good', 48059), ('get', 42860), ('go', 39381), ('great', 34853), ('come', 31299), ('order', 30485), ('time', 25805), ('really', 24657), ('well', 23251), ('make', 23026), ('wait', 22995), ('try', 22476), ('service', 21731), ('back', 21142), ('restaurant', 19958), ('delicious', 19434), ('love', 18815), ('also', 18654), ('dish', 16481), ('best', 16375), ('table', 16216), ('eat', 15703), ('sauce', 15684), ('friend', 15674), ('say', 15619), ('even', 14871), ('menu', 14814)]
[plot output: lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistAllWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistLemma.csv", encoding = 'utf-8', index = False, header = False)
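wordcloud is imported in the setup cell but never used in this section; a minimal sketch (assuming the freqDist2 lemma frequencies computed above) of how it could visualize the same data:
from wordcloud import WordCloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(dict(freqDist2))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()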
# process reviews by removing stopwords in positive reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPositive['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Positive dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Positive dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Positive dfTokens.head(10):
0
0 -
1 nt
2 say
3 enough
4 good
5 thing
6 -
7 -
8 place
9 -
Positive dfLemma.head(10):
lem
0 - solid - star - - greek food spot - - - - - fan - lamb - - - - - - come - - try - lamb sandwich - amazingly tender - juicy - onion - arugula - also - - good greek salad -
1 pretty cool place - good food - good people
2 - - - braise lamb sandwich - - - - - best sandwich - - life - - - - favour - try - place - friendly service - cosy atmosphere -
3 need - quick bite - stop - - - - review - - - really cute - small - - - - roast sandwich - - - good - service - - friendly - - - nice place - break - shopping
4 quick - delicious - fill - - - - hour - shopping - soho - - starve - - - nt accommodate - - - - - first - - - tiny - - - take - number - call - back - - min later - fresh ingredient - - flavor hit - - right note - - pastitsio - delicate - - hummus - creamy - - dolmades werent - dense - - tart - service - - - smile - - - definitely good - try - - mellow saturday afternoon -
5 novelty meet mediterranean meet soho - - place - - squeeze - - - - - seater table - - - - mean squeeze - - - - money spot - lunch - - quick - - go bite - go - - lamb sandwich - - dressing - fantastic flavor pairing - - - - look - something lighter - - - stomach - - wallet - try - soup - - mediterranean sandwich - full - veggie - - great variety - cute location - good food -
6 - place - tiny - - think - fit - people max - keep - - mind - consider - come - - - weekend night - - food - great - - set romantic - - recommend come - - - - - - area - - - - open seat - oh word - warn - - - guess - - standard - small shop establishment - nyc - - saw - roach - - wall - -
7 perfect - - - name implies - great butter bean salad - even well winter soup - perfect - date - - oneonone dinner - - - nt bring - part - - - - - - - - exclusively single table - seat maybe - - together - really - gem - especially - - - nt advertise - - - greek restaurant - - - food - definitely mediterranean -
8 yums - - - try - carp roe - - - sooooooooooooo good - - - - feta - tomato - - - combination platter - - jam -
9 small place big - taste - stop - - - wife - shopping - absolutely delicious - - friendly waitress -
# sum of counts
print("Group by positive lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 positive words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by positive lemma'd words, add count and sort:
Get just the first row in each positive lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 place 22 place place NN 38165
1 nt 64 nt nt NN 35926
2 food 3 food food NN 35534
3 good 17 good good JJ 34960
4 get 353 get get VB 31360
5 great 131 great great JJ 29575
6 go 111 go go VB 28693
7 come 7 come come VB 22294
8 order 231 ordered order VB 20162
9 time 510 time time NN 19299
Top 10 positive words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 place 22 place place NN 38165
1 nt 64 nt nt NN 35926
2 food 3 food food NN 35534
9 time 510 time time NN 19299
13 try 8 try tri NN 17078
16 love 242 love love NN 16338
18 service 37 service servic NN 14922
20 restaurant 200 restaurant restaur NN 14355
22 sauce 524 sauces sauc NN 12063
23 dish 333 dishes dish NN 11573
POS_TYPE: JJ
lem index token stem pos counts
3 good 17 good good JJ 34960
5 great 131 great great JJ 29575
12 delicious 58 delicious delici JJ 17378
21 best 30 best best JJ 13829
31 nice 53 nice nice JJ 10691
32 little 287 little littl JJ 10673
47 small 47 small small JJ 8115
54 fresh 74 fresh fresh JJ 7545
55 much 1379 much much JJ 7505
77 new 1295 new new JJ 6187
POS_TYPE: VB
lem index token stem pos counts
4 get 353 get get VB 31360
6 go 111 go go VB 28693
7 come 7 come come VB 22294
8 order 231 ordered order VB 20162
11 make 553 makes make VB 17636
15 wait 362 waited wait VB 16345
25 eat 265 eating eat VB 11418
29 amaze 376 amazing amaz VB 11175
33 fry 3261 fried fri VB 10611
35 say 530 say say VB 10172
POS_TYPE: RB
lem index token stem pos counts
10 really 45 really realli RB 17895
14 well 178 better better RB 16389
17 back 71 back back RB 15473
19 also 16 also also RB 14859
26 definitely 90 definitely definit RB 11334
34 even 177 even even RB 10330
50 pretty 20 pretty pretti RB 7684
52 always 1301 always alway RB 7641
61 first 66 first first RB 7107
65 friendly 36 friendly friendli RB 6870
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive flatTokensList[:10]: ['a', 'solid', '', 'stars', 'for', 'this', 'greek', 'food', 'spot', '']
Positive Frequency Distribution of all words[:30]: [('the', 339730), ('and', 215930), ('i', 169996), ('a', 166721), ('to', 121316), ('was', 107807), ('it', 100928), ('of', 100117), ('is', 86846), ('for', 71858), ('in', 68609), ('with', 62532), ('but', 54269), ('you', 54261), ('that', 50960), ('we', 47933), ('this', 47727), ('my', 46778), ('on', 41822), ('s', 40978), ('had', 37770), ('nt', 35926), ('so', 35035), ('food', 34973), ('good', 34810), ('place', 34424), ('they', 34375), ('were', 33137), ('have', 31019), ('not', 30695)]
Positive Frequency Distribution of lemma[:30]: [('place', 38165), ('nt', 35926), ('food', 35534), ('good', 34960), ('get', 31360), ('great', 29575), ('go', 28693), ('come', 22294), ('order', 20162), ('time', 19299), ('really', 17895), ('make', 17636), ('delicious', 17378), ('try', 17078), ('well', 16389), ('wait', 16345), ('love', 16338), ('back', 15473), ('service', 14922), ('also', 14859), ('restaurant', 14355), ('best', 13829), ('sauce', 12063), ('dish', 11573), ('friend', 11426), ('eat', 11418), ('definitely', 11334), ('menu', 11315), ('chicken', 11196), ('amaze', 11175)]
[plot output: positive lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistPosWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in neutral reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNeutral['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Neutral dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Neutral dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Neutral dfTokens.head(10):
0
0 -
1 pizza
2 -
3 -
4 good
5 -
6 -
7 staff
8 -
9 helpful
Neutral dfLemma.head(10):
lem
0 - little place - soho - wonderful - - - - lamb sandwich - - glass - wine - - price shock - - - small - serve - - - - - - - - soho - - staff - - - little snotty - rude - - - food - great - - - nt expect worldclass service -
1 nice little greek restaurant - serf authentic greek dish - - nt go - - - look - - gyro - - - type - greek food - - ever - - - - - - small - quaint restaurant - seat - total - - people - - - food - - good - - - - veal - rice dish - - - portion - - - little small -
2 decent mediterranean place - - space - - small - - - dish - fresh - - service - friendly - - little overprice - - opinion - - - - nt understand - - - - extra charge - pita - especially - soup - hummus - beet salad - - fresh - - - large - - - - beet salad - soup - also tasty - hearty - - particular - lentil soup - - - tad - - salty side -
3 - really enjoy - experience - - tiny yet tasty restaurant - - - honest - - first big - - - - - seat almost immediately - - seat restaurant - - try go past pm - - - friend - jennifer - - - - get - fava - start - - - - - tasty - - thicker texture - hummus - - - - get extra pita - free - - - also get - classic greek avgolemon - - - - first time - - traditional dish - - - soup afterwards - - forever compare - - - - jennifer - say - - - miss - chicken - - even - - - - - - flavorful dish - jennifer get - special - - day lamb dish - - - uniquely flavor - - - - opinion - - bit expensive - - quantity - - - appreciate - - - - - - table - - - - rush - - - restaurant - - - - - people start wait - seat - - tiny space - - - automatically feel guilty - stay - long - - - - great food - - - big expensiveo - - - opinion - - - glad - get - chance - try - - -
4 pro - - food - actually - good con - someone - - nt drink wine - - alcohol - - matter - - picked - wine - - list - - - nt see - - - - - - way - - order - bottle - white - - swear - taste - someone pour - half - bottle - refill - - milwaukee - best - natty lite - also - - service - - - good - pleasant - - - - strange consider - - nt possibly fit - - - people - - entire restaurant -
5 peppinos - far superior - - typical neighborhood slice shop - - - nt - destination pizza place - - staff - friendly - - fault - - almost make - - - error - - food - - - - - last - time - - - order pie - - - - - put - wrong topping - - pie - - - - - picky guy - - - annoy - - - pay - - - - - pizza - - - - sure - - issue - - - hope - get - fix - delivery service - inconsistent - delivery time - - minute - - - food - arrive hot - time - - addition - - pie - salad - sandwich - also delicious - especially - pepperoni hero -
6 - - - - twice - - get - spinach ravioli - time - - - really good - - - - - complaint - - - go - yesterday - - pm - - place - filthy - - floor - disgust - - - - wipe crumb - - - seat -
7 nice pizza restaurant - marguerita pizza - great - - expensive
8 peppino - make - - solid pizza - - ingredient - top notch - - - dough - use - particularly good - - interior - - family style appeal - great neighborhood place - - - complaint - - - price - - - tad high - - local pizza joint - - small plain pizza - - - - - - small - totonno - -
9 - - - bad review - previously post - - make - trip - - - - delightfully surprised - - service - friendly - invite - - - pizza - - thin crust - - bit soggy - - - flavor played - - mouth - - perfect concerto - - make - salieri wannabe jealous - brick oven - - family environment add great touch - - real hero - - pizza - - - make - trip - - try - pasta - - - continued
# sum of counts
print("Group by neutral lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each neutral lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 neutral words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by neutral lemma'd words, add count and sort:
Get just the first row in each neutral lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 19 nt nt NN 10076
1 good 46 good good JJ 9165
2 food 17 food food NN 7420
3 place 1 place place NN 7061
4 get 110 got got VB 6521
5 go 32 go go VB 5436
6 come 376 came came VB 4967
7 order 194 ordered order VB 4959
8 really 89 really realli RB 4240
9 well 452 better better RB 3879
Top 10 neutral words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 19 nt nt NN 10076
2 food 17 food food NN 7420
3 place 1 place place NN 7061
11 time 127 time time NN 3510
13 try 104 try tri NN 3334
14 service 22 service servic NN 3311
17 restaurant 26 restaurant restaur NN 2742
18 table 154 tables tabl NN 2673
19 dish 30 dishes dish NN 2658
26 friend 108 friend friend NN 2346
POS_TYPE: JJ
lem index token stem pos counts
1 good 46 good good JJ 9165
12 great 18 great great JJ 3397
24 nice 23 nice nice JJ 2375
25 little 0 little littl JJ 2360
32 much 1104 much much JJ 2067
42 small 10 small small JJ 1811
55 delicious 271 delicious delici JJ 1477
58 best 205 best best JJ 1456
62 bad 330 bad bad JJ 1418
71 overall 868 overall overal JJ 1200
POS_TYPE: VB
lem index token stem pos counts
4 get 110 got got VB 6521
5 go 32 go go VB 5436
6 come 376 came came VB 4967
7 order 194 ordered order VB 4959
10 wait 159 waiting wait VB 3517
16 make 235 makes make VB 2892
22 say 135 said said VB 2522
23 taste 198 tasted tast VB 2508
29 give 377 give give VB 2192
34 take 1030 took took VB 2064
POS_TYPE: RB
lem index token stem pos counts
8 really 89 really realli RB 4240
9 well 452 better better RB 3879
15 back 558 back back RB 2906
20 pretty 627 pretty pretti RB 2569
21 also 80 also also RB 2523
39 even 138 even even RB 1873
51 definitely 705 definitely definit RB 1558
67 first 97 first first RB 1365
69 still 1333 still still RB 1335
75 maybe 1021 maybe mayb RB 1186
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Neutral flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Neutral Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Neutral flatTokensList[:10]: ['this', 'little', 'place', 'in', 'soho', 'is', 'wonderful', '', 'i', 'had']
Neutral Frequency Distribution of all words[:30]: [('the', 70369), ('and', 38237), ('i', 37776), ('a', 33433), ('was', 29465), ('to', 25569), ('it', 22772), ('of', 19388), ('but', 15686), ('for', 15603), ('is', 14668), ('in', 12669), ('that', 11733), ('with', 11172), ('nt', 10076), ('we', 9917), ('not', 9794), ('good', 9143), ('my', 8938), ('were', 8676), ('on', 8646), ('you', 8532), ('this', 8258), ('s', 8055), ('had', 7600), ('food', 7297), ('they', 7111), ('so', 6428), ('place', 6253), ('at', 5934)]
Neutral Frequency Distribution of lemma[:30]: [('nt', 10076), ('good', 9165), ('food', 7420), ('place', 7061), ('get', 6521), ('go', 5436), ('come', 4967), ('order', 4959), ('really', 4240), ('well', 3879), ('wait', 3517), ('time', 3510), ('great', 3397), ('try', 3334), ('service', 3311), ('back', 2906), ('make', 2892), ('restaurant', 2742), ('table', 2673), ('dish', 2658), ('pretty', 2569), ('also', 2523), ('say', 2522), ('taste', 2508), ('nice', 2375), ('little', 2360), ('friend', 2346), ('chicken', 2261), ('sauce', 2212), ('give', 2192)]
[plot output: neutral lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNeutWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNeutLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in negative reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegative['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("Negative dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("Negative dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
Negative dfTokens.head(10):
0
0 -
1 nt
2 bother
3 -
4 -
5 problem
6 -
7 -
8 wet
9 -
Negative dfLemma.head(10):
lem
0 - meaning - try - place - - whilehighly recommend - - friend - - - tuna sandwich - good - get terribly sick - word - also - sage tea - nice -
1 stop - - lunch takeout - work - - ask - server - - well - roast vegetable sandwhich - vegetable souvlaki - - reply boastfully - - vegetarian souvlaki - apparently - enjoys tzatziki sauce run - - - pita - - - hand - - - - face - unfortunately - - - - - - taste - nt make - - - mess -
2 - walk halfway - manhattan - - restaurant - - wish - - stayed - home - - - decent food - - - seem - - get - - - - corner deli - - flavor seem bland - - menu cryptic - actual ingredient - - salad - - - give kudos - pleasant wine - attentive - - - overbear service -
3 hmm - - far - - - - impressed - stop - - grab - sandwich - take - - woman - - counter clearly wish - - hurry - - make - decision - - place - order - - give - - total - - - - - want - give - exactly - amount - - - - take - - penny - make - - - cent - - cent - - oh - - penny - - - nt - penny - - - - - - - - - - - suppose - know - - - - - nt accept penny - - nt penny money - - - - take - dime - - - penny - wtf - - give - - quarter instead - - - course - get back - dime - - - - need - - change - - told - - sit - - - - sandwich - - brought - - - ready - fine - - - - - seat - anyway - sat - - - bench - tick - tock - tick - tock - - minute go - - - - - - wonder seriously - long - take - make - bloody sandwich - - - go - - ask - - - turn - - sandwich - - sit - - counter - whole time - - girl - - different - - - - - counter say - - oh - - - - - - - - - employee - talk - - - - - place - - mean - - - sit - day - - bench - - watch - yuppie family pile - dknysporting toddler - - luxury suv - - - think - order - sandwich - - - hungry - - sandwich - - - turn - - - really good - lovely ciabatta bread - fresh ingredient - - order - marinate sandwich - - - allinall delightful - - - alidoro - - half - block away - - think - - give snack place - miss next time -
4 - food - average pizzeria - - cheap - add - - - fact - - puked - gut - - - bathroom - - - meal - - - - decide - - go back -
5 want - love - - look great come - - fell short - - - much oilgrease - good amount - cheese - sauce - even - crust - - good texture - - - soft - - - crispy - char - - - - flavor - - star - - sure - experienced well slice - - - star - friendly service - - - - - slicesoda recession deal - - nice owner - - say -
6 food - creative - thought provoke - - make sure - eat - - go - portion - ridiculously small - left - feel shortchanged - hungry - - - celery oyster stew - - know - - brooklyn - - - - - potato - oyster cracker filler cost - much - - stew scarcely cover - bottom - - cavernous bowl - conversation - dinner tend - echo - - expose side - say bowl - heard - story twice - dekalb - maybe focus less - furbish - wall - - restaurant - - - - furbish - wall - - bowl - cheapingredientyetfancynamedstew - probably - nt - - next time - p - semiredeeming quality - squash tot - good concept - - - squash - becomes - - - verb - meeting - fork -
7 - - - excite - try - place - - - - - - - block away - look packed whenever - walk - - holy schizza - - disapointed - - go - brunch - - busy sunday afternoon - - nt wait - get - food - - - pretty hungry - first - - - mimosa - lukewarm - second - - fry - lukewarm - well - taste - - - - cooked - minute ago - - din room - packed - - - - turnover - - decent - - food fresh - - egg - - omelette - - thick - - egg - fill ratio - - - - - - - - - thinly slice mushroom - - entire omelette - - half inch pancake concoction - egg - - - - service - - linger - meal - coffee - - eventually finish - cup - - see - waiter walk - - - fresh pot - coffee - - - already think - - head - - thanks - - thanks - - - - nt even get - chance - kindly pas - refill - coffee - - walk right - - - fill - lady - cup - go right back - put - pot away - maybe le parisien caught - - - bad day - mediocre food - - average service - nt belong - ny - - - good note - - - really cute - - - decor seem tres french -
8 - place - - deserve - star - - smoke salmon - old - taste horrible - - english muffin - - egg florentine - soggy - - omelette - greasy - - website make - restaurant seem upscale - - - cramped - shabby - - - - small toilet next - - kitchen - - trust yelp - want - bring - friend - - beautiful french restaurant - brunch - - - disgust - - instead - - - circumstance - - place rate - star -
9 go - - - second time - - - nt - good - - remember - order - pasta - steak frites - mussel appetizer - - drink - overall - underwhelming - total bill include tip come - - lil - - - - maybe people go - - - - - bistro - - area -
# sum of counts
print("Group by negative lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 negative words by part of speech:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by negative lemma'd words, add count and sort:
Get just the first row in each negative lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 43 nt nt NN 8346
1 food 54 food food NN 6852
2 place 2 place place NN 5643
3 get 9 got got VB 5069
4 go 141 go go VB 5030
5 order 89 order order NN 4608
6 good 8 good good JJ 4262
7 come 223 coming come VB 3914
8 service 73 service servic NN 3241
9 time 156 time time NN 3218
Top 10 negative words by part of speech:
POS_TYPE: NN
lem index token stem pos counts
0 nt 43 nt nt NN 8346
1 food 54 food food NN 6852
2 place 2 place place NN 5643
5 order 89 order order NN 4608
8 service 73 service servic NN 3241
9 time 156 time time NN 3218
10 wait 342 wait wait NN 3145
13 restaurant 49 restaurant restaur NN 2862
14 table 586 table tabl NN 2812
21 try 1 try tri NN 2119
POS_TYPE: JJ
lem index token stem pos counts
6 good 8 good good JJ 4262
23 bad 422 bad bad JJ 2051
29 great 222 great great JJ 1703
33 much 226 much much JJ 1571
57 nice 16 nice nice JJ 1129
61 small 264 small small JJ 1083
70 little 525 little littl JJ 922
73 best 638 best best JJ 907
85 many 767 many mani JJ 799
92 next 204 next next JJ 769
POS_TYPE: VB
lem index token stem pos counts
3 get 9 got got VB 5069
4 go 141 go go VB 5030
7 come 223 coming come VB 3914
11 say 160 says say VB 2938
17 make 44 make make VB 2517
19 take 80 take take VB 2326
20 ask 21 asked ask VB 2163
22 give 67 give give VB 2069
25 want 92 wanted want VB 1947
31 know 106 know know VB 1589
POS_TYPE: RB
lem index token stem pos counts
12 well 23 better better RB 2891
15 back 122 back back RB 2587
16 really 183 really realli RB 2521
18 even 232 even even RB 2349
34 never 613 never never RB 1555
37 also 13 also also RB 1471
46 first 347 first first RB 1334
62 pretty 345 pretty pretti RB 1065
71 maybe 298 maybe mayb RB 921
72 still 649 still still RB 916
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
#show frequency after removing stop words
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative flatTokensList[:10]: ['been', 'meaning', 'to', 'try', 'this', 'place', 'for', 'a', 'whilehighly', 'recommended']
Negative Frequency Distribution of all words[:30]: [('the', 53142), ('and', 30653), ('i', 29513), ('to', 24624), ('a', 23473), ('was', 22439), ('it', 16114), ('of', 15044), ('for', 11857), ('we', 11046), ('that', 10699), ('in', 10512), ('is', 10036), ('but', 9389), ('not', 8920), ('this', 8353), ('nt', 8346), ('my', 8013), ('with', 7458), ('were', 7155), ('you', 6863), ('they', 6796), ('food', 6761), ('on', 6700), ('had', 6235), ('at', 6051), ('s', 5612), ('so', 5521), ('have', 5212), ('place', 4960)]
Negative Frequency Distribution of lemma[:30]: [('nt', 8346), ('food', 6852), ('place', 5643), ('get', 5069), ('go', 5030), ('order', 4608), ('good', 4262), ('come', 3914), ('service', 3241), ('time', 3218), ('wait', 3145), ('say', 2938), ('well', 2891), ('restaurant', 2862), ('table', 2812), ('back', 2587), ('really', 2521), ('make', 2517), ('even', 2349), ('take', 2326), ('ask', 2163), ('try', 2119), ('give', 2069), ('bad', 2051), ('taste', 2034), ('want', 1947), ('eat', 1934), ('people', 1803), ('friend', 1768), ('great', 1703)]
[plot output: negative lemma frequency distribution, top 30 tokens (x: Samples, y: Counts)]
# output frequency data to csv for further analysis in alteryx
dfwords = pd.DataFrame(FreqDistSortedWordList)
dfwords.to_csv("../YelpData/YelpNYC/freqDistNegWords.csv", encoding = 'utf-8', index = False, header = False)
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegLemma.csv", encoding = 'utf-8', index = False, header = False)
# process reviews by removing stopwords in positive real reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # add a "-" placeholder to the lemma list (replaced below when the token is kept)
        lemmaTokensFromReview.append("-")
        # keep the token only if it is not a stopword, contains a valid character, and is long enough
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, with default to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos = pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        # replace the trailing placeholder with the lemma
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value = '', inplace = True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns = ['lem'])
print("positive real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive real dfTokens.head(10):
0
0 holy
1 god
2 -
3 -
4 -
5 live
6 -
7 nyc
8 -
9 -
positive real dfLemma.head(10):
lem
0 best place - brunch - - - handle - wait - definitely get - mac - cheese -
1 freeman - - - hyped - belief - - - - bush twin rumor - - - - - - - still good food - - devilsonhorseback - prune stuffed - stilton cheese wrap - bacon - - lovely - - mac - cheese - - - someone - grandma make - - - winebytheglass choice - good - everything - pretty affordable - everyone say - - impossible - get - table - - - - monday - - - place - empty - - waitstaff - really friendly - helpful - -
2 cozy little greek place - love - meze - highly recommend - skordalia - potato garlic puree - - - tzatziki - cucumber yogurt dip - - - also - - - traditional dish - - think - greek food - spanakopitakia - spinach pie - pastitsio - mousaka - - nt forget - greek dessert - yogurt - - honey - - course - - halva - baklava -
3 pylos honor - breadth - traditional greek cuisine - bring fresh cooking - - region - greece - - elegant - contemporary - comfortable set - - east village - next time - - - new york - - nt miss - hidden jewel - - owner christos - - classy man - - take care - - - - - - - best friend - pylosrestaurantcom
4 go - - sunday brunch - - - visit - - - - satisfy meal - - egg - toast - home potato - - nice thick bacon - slice - chocolatepeanut butter cream pie share - - - - end - - nice topper - - impressed - - quality buttermilk biscuit - - table - seat - - greatly need - - long wait - make - starve - bit pricey - - wait detract - star - - - - definately recommend - - lazy sunday meal -
5 come - - buffalo wing - stay - - catfish burger - also feature - buffalo - ny favorite - beef - weck - hockey - ever - - - - - tv - good fry - serve - chipotle mayo -
6 expert - polenta - homemade hummus - chicken - brie sandwich - - pretty much everything else - bonnie - - reliably excellent food - - - loses point - - occasional excessively greasy sirloin burger - - - - - fool - - put - chipotle mayo - - - fry - scarf - - - - - also - amaze beer selection -
7 best espresso - nyc - - - good friend google agrees - - - - - - go - - little - - - - nyc visit friend - - buddy josh - - foodie - - espresso naziconnisseur - show - - new spot - - - tell - - - - want - best espresso - town - - - pay - - cab - let - go - - - - tell - - - walk - - joint - - block away - - - - way - tell - - acidity - foam - temperature - - - - - - sit - - - charm caferestaurant - - thoroughly impressed - - place smell italian - - - tough - explain - old place - europe - - funky smell - - remember forever - - joint - - import - euro stank - authenticity - - work - - - charm - - - pant - - espresso - - best - - ever - - - - drank - share - - - - - acidic - linger - - - mouth - - deep richness - - get home - - - forgotten - name - - amaze cafe - - - know - much - - critic josh - - - google - best espresso nyc - - low - behold - - first result - - quadronno - check - - website - - - see - - sat right - - viking mural -
8 locate - - heart - manhattan - theater district - carmine - - - pack - - - - open - june - - - - - pasta - sound steep - - everything - - primarily - - southern region - italy - - serve family style - - huge platter overflow - food - - need - come - - - empty stomach - - willingness - set aside - diet regime - - day - - carmine - chef - stuff - - - typical italian mother - believe - - - - - waiter - actually advise - - cut back - - feel - - order - much food - - party - - indulge - - hot antipasto - penne - la vodka - - chicken marsala - - brought home enough leftover - - - meal - - carmine - - nt serve - - quantity - everything - fresh - deliciously season - - cooked - order - especially - pasta - - - perfectly al dente - dessert - - delicious - - - - never - room - sample - - - choice - - atmosphere - festive - - - big fat italian wedding - - - - go - - - - - hour - - - pm - - - urgent - - make reservation well - advance - - - mean week - - day - - - otherwise - wait - - table - - long - - accept reservation - - size party - pm - - - pm - - party - - - - - open - pm sunday - monday - midnight - rest - - week - - - - - perfect place - - posttheater supper - - - - nt mind go - bed - - - full stomach - - visit - new york city - complete - - meal - carmine - - - - - recommend - gem enough -
9 - nt go wrong - - - - burger - long line - poor service - - - beef - good - however - - - best - - city - see - review -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive real lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 place 1 place place NN 34855
1 nt 76 nt nt NN 33788
2 good 16 good good JJ 32273
3 food 17 food food NN 31760
4 get 6 get get VB 29120
5 great 661 great great JJ 26826
6 go 119 went went VB 26333
7 come 163 come come VB 21021
8 order 363 ordering order VB 19094
9 time 103 time time NN 17682
Top 10 words by part of speech used in positive real reviews:
POS_TYPE: NN
lem index token stem pos counts
0 place 1 place place NN 34855
1 nt 76 nt nt NN 33788
3 food 17 food food NN 31760
9 time 103 time time NN 17682
13 try 673 try tri NN 15837
14 wait 4 wait wait NN 15251
16 love 53 love love NN 14969
19 service 463 service servic NN 13531
20 restaurant 579 restaurants restaur NN 12877
22 sauce 824 sauce sauc NN 11567
POS_TYPE: JJ
lem index token stem pos counts
2 good 16 good good JJ 32273
5 great 661 great great JJ 26826
12 delicious 396 delicious delici JJ 16104
21 best 0 best best JJ 12730
32 nice 129 nice nice JJ 9830
33 little 50 little littl JJ 9745
45 small 1282 small small JJ 7572
51 fresh 92 fresh fresh JJ 7138
54 much 192 much much JJ 7044
77 new 104 new new JJ 5627
POS_TYPE: VB
lem index token stem pos counts
4 get 6 get get VB 29120
6 go 119 went went VB 26333
7 come 163 come come VB 21021
8 order 363 ordering order VB 19094
11 make 30 made made VB 16254
31 amaze 213 amazing amaz VB 9968
34 say 38 says say VB 9525
38 take 114 take take VB 8628
50 seat 146 seating seat VB 7306
55 give 1669 gives give VB 6889
POS_TYPE: RB
lem index token stem pos counts
10 really 46 really realli RB 16676
15 well 413 well well RB 15039
17 back 361 back back RB 14215
18 also 65 also also RB 13760
24 definitely 5 definitely definit RB 10709
35 even 499 even even RB 9437
49 pretty 35 pretty pretti RB 7324
56 always 629 always alway RB 6806
65 first 306 first first RB 6434
72 friendly 47 friendly friendli RB 6023
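# Caveat on the POS tables above: nltk.pos_tag([newToken]) tags each token in
# isolation, so the tagger has no sentence context and falls back to each word's
# most common tag -- which is why 'love' and 'wait' land under NN here, and
# 'atmosphere' can even surface as RB in later tables. A minimal sketch of the
# difference, reusing the NLTK setup above (example sentence is illustrative only):
sentence = "I love the friendly atmosphere at this place"
tokens = nltk.word_tokenize(sentence)
print([nltk.pos_tag([t])[0] for t in tokens])  # isolated: 'love' -> NN
print(nltk.pos_tag(tokens))                    # in context: 'love' -> VBP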
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("PosRealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("PosRealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
PosRealReview flatTokensList[:10]: ['best', 'place', 'for', 'brunch', 'if', 'you', 'can', 'handle', 'the', 'wait']
PosRealReview Frequency Distribution of all words[:30]: [('the', 314511), ('and', 200258), ('i', 159158), ('a', 155092), ('to', 112463), ('was', 100859), ('it', 94501), ('of', 93623), ('is', 79486), ('for', 66962), ('in', 63505), ('with', 58824), ('but', 50548), ('you', 49955), ('that', 47654), ('we', 44251), ('this', 44214), ('my', 43657), ('on', 38881), ('s', 38550), ('had', 35069), ('nt', 33788), ('so', 32750), ('they', 32167), ('good', 32137), ('place', 31376), ('food', 31202), ('were', 30978), ('not', 28659), ('have', 28368)]
PosRealReview Frequency Distribution of lemma[:30]: [('place', 34855), ('nt', 33788), ('good', 32273), ('food', 31760), ('get', 29120), ('great', 26826), ('go', 26333), ('come', 21021), ('order', 19094), ('time', 17682), ('really', 16676), ('make', 16254), ('delicious', 16104), ('try', 15837), ('wait', 15251), ('well', 15039), ('love', 14969), ('back', 14215), ('also', 13760), ('service', 13531), ('restaurant', 12877), ('best', 12730), ('sauce', 11567), ('dish', 11249), ('definitely', 10709), ('eat', 10628), ('friend', 10589), ('menu', 10494), ('chicken', 10245), ('fry', 10192)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
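# Side note on the stem vs lem columns above: Porter stems such as 'servic',
# 'realli' and 'delici' are truncations rather than dictionary words, which is
# why the counts are grouped on the WordNet lemma instead. A minimal sketch
# contrasting the two, reusing the stemmer and lemmatizer from the setup:
for word, pos in [('restaurants', 'n'), ('ordering', 'v'), ('delicious', 'a')]:
    print(word, '->', stemmer.stem(word), '|', lemmatizer.lemmatize(word, pos=pos))
# restaurants -> restaur | restaurant
# ordering -> order | order
# delicious -> delici | delicious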
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
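# wordcloud is imported in the setup but never used; a minimal sketch of
# visualizing the same lemma frequencies as a word cloud (assumes the
# FreqDistSortedLemmaList built above; generate_from_frequencies takes a
# {word: count} mapping):
from wordcloud import WordCloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(dict(FreqDistSortedLemmaList))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()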
# process reviews by removing stopwords in negative real reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegReal['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative real dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative real dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative real dfTokens.head(10):
0
0 -
1 first
2 experience
3 -
4 -
5 restaurant
6 -
7 get
8 -
9 -
negative real dfLemma.head(10):
lem
0 middle eastern cuisine - - mediocre - - ok - - - drunk - want something salty - crunchy - - - give kudos - - amount - crap - - - able - magically fit - - pitahummus - falafel - babaganoush - onion - pickle - cabbage - lettuce - saucebut - - walk - - - - - - - time - - stomach ache - - - people - - know - make - wonder - - really - - tahini -
1 - spending - - - - people - include tip - - cheap bottle - wine - - - - - - satisfied - - say - least - instead - - - left - - delicious memory - yesterday - dinner - probably - - - nt - - delicious - definitely - delicious enough - - value - - food - good - - - even close - great - - agree - - previous review - - - - - - hype - - heard - read - - restaurant - - leaf - wonder - - best - - memorable part - - meal - - complementary muffin - - next day - breakfast - - recommend -
2 pommes frites make - lot - noise - - - double fry - potato - - - - - suppose - - - - - - - - many oddly cooked - mushyontheinside fry - want - go back - - sauce - definitely worth try - - - staff - always willing - give free sample - - last time - - - - - - - liquor license - - - suggest - small deli next door - alcoholic refreshment - - go several time - try - - - - - - - end - prefer - frites - - cafe du bruxelles - - - le halle -
3 - - - hype - - - nt get - - - really disliked - place - hamburger - - flavor - - - - bun - unimpressive - burger joint - shake shack - jg melon - blow - place away -
4 - food - - good - - love - design - - din room - - open kitchen area - however - - get - little noisy - - service - - bit haphazard -
5 defintely - unique place especially - - - risotto - - portion - kind - skimpy - - price - - - get - roast chicken - asparagus - pine nut mix - - - tad bland - - sticky - perhaps - - - give - - shot - - different order - - - nt - impressed -
6 overated pizza - uneven balance - cheese - - great - - - - tourist look - - sample - institution - try grimaldi - - - want - real ny pie -
7 mediocre - - - nt see - everyone make - deal - - place - maybe - - order - bowl - berry - - meal - - try - friend meatloaf - - - ok - - heard - - breakfast - horrible - - decor - - - date - trendy - - look - - keep - clean - - service - - good - - - overall - - - nt impressed -
8 - place - overrate - overprice - overhipsterified - - - - give - kid balloon - - - - photo booth - - basement - - nt mean - - worth - visit -
9 - - nt - - - - - food suck - - service - wish - go elsewhere -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative real lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative real reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative real lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 nt 52 nt nt NN 7314
1 food 58 food food NN 5640
2 place 137 place place NN 4849
3 get 134 get get VB 4421
4 go 97 go go VB 4281
5 order 190 order order NN 4074
6 good 59 good good JJ 3716
7 come 264 come come VB 3465
8 time 26 times time NN 2786
9 wait 370 waited wait VB 2773
Top 10 words by part of speech used in negative real reviews:
POS_TYPE: NN
lem index token stem pos counts
0 nt 52 nt nt NN 7314
1 food 58 food food NN 5640
2 place 137 place place NN 4849
5 order 190 order order NN 4074
8 time 26 times time NN 2786
10 service 164 service servic NN 2675
13 table 373 table tabl NN 2483
14 restaurant 69 restaurant restaur NN 2359
23 taste 1400 taste tast NN 1766
24 want 6 want want NN 1723
POS_TYPE: JJ
lem index token stem pos counts
6 good 59 good good JJ 3716
25 bad 437 bad bad JJ 1686
29 great 62 great great JJ 1523
32 much 420 much much JJ 1377
57 nice 1026 nice nice JJ 980
62 small 114 small small JJ 905
70 little 162 little littl JJ 807
72 best 72 best best JJ 796
86 next 78 next next JJ 684
89 many 91 many mani JJ 677
POS_TYPE: VB
lem index token stem pos counts
3 get 134 get get VB 4421
4 go 97 go go VB 4281
7 come 264 come come VB 3465
9 wait 370 waited wait VB 2773
12 say 43 say say VB 2522
15 make 31 makes make VB 2310
19 take 397 take take VB 1924
20 ask 570 asked ask VB 1791
21 try 102 trying tri VB 1782
22 give 10 give give VB 1771
POS_TYPE: RB
lem index token stem pos counts
11 well 363 well well RB 2531
16 really 33 really realli RB 2280
17 back 98 back back RB 2170
18 even 60 even even RB 2075
37 also 638 also also RB 1263
41 never 441 never never RB 1189
50 first 374 first first RB 1087
56 pretty 1260 pretty pretti RB 1029
66 still 433 still still RB 853
75 long 371 long long RB 779
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative RealReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative RealReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative RealReview flatTokensList[:10]: ['middle', 'eastern', 'cuisine', 'that', 's', 'mediocre', '', 'and', 'ok', 'if']
Negative RealReview Frequency Distribution of all words[:30]: [('the', 45943), ('and', 26254), ('i', 25613), ('to', 20932), ('a', 20391), ('was', 19491), ('it', 14226), ('of', 13363), ('for', 10324), ('we', 9543), ('that', 9441), ('in', 9052), ('is', 8636), ('but', 8392), ('not', 7654), ('nt', 7314), ('this', 7114), ('my', 6921), ('with', 6398), ('were', 6310), ('you', 5946), ('on', 5842), ('they', 5688), ('food', 5568), ('had', 5176), ('at', 5082), ('s', 4904), ('so', 4707), ('have', 4416), ('place', 4193)]
Negative RealReview Frequency Distribution of lemma[:30]: [('nt', 7314), ('food', 5640), ('place', 4849), ('get', 4421), ('go', 4281), ('order', 4074), ('good', 3716), ('come', 3465), ('time', 2786), ('wait', 2773), ('service', 2675), ('well', 2531), ('say', 2522), ('table', 2483), ('restaurant', 2359), ('make', 2310), ('really', 2280), ('back', 2170), ('even', 2075), ('take', 1924), ('ask', 1791), ('try', 1782), ('give', 1771), ('taste', 1766), ('want', 1723), ('bad', 1686), ('eat', 1583), ('friend', 1544), ('people', 1528), ('great', 1523)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegRealRevLemma.csv", encoding = 'utf-8', index = False, header = False)
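# The tokenize/clean/lemmatize loop above is repeated verbatim for each review
# subset; a sketch of factoring it into a helper (hypothetical name
# process_reviews; assumes the stopwords, stemmer, lemmatizer, remove_accents
# and constants from the setup; it samples per subset instead of using the
# global ITER counter, a minor simplification):
def process_reviews(reviews, sample_step=SAMPLE_STEP):
    """Return (token tuples, per-review token lists, per-review lemma strings)."""
    reviews_tokens, token_lists, lemma_strings = [], [], []
    for i, review in enumerate(reviews):
        if i % sample_step != 0:
            continue
        tokens = [w.lower() for s in nltk.sent_tokenize(review)
                  for w in nltk.word_tokenize(s)]
        review_tokens, review_lemmas = [], []
        for token in tokens:
            token = remove_accents(token)
            review_tokens.append(token)
            lem = "-"  # placeholder kept for tokens that are filtered out
            if (token not in stopwords and re.search(RE_VALID, token)
                    and len(token) >= MIN_STR_LEN):
                pos = nltk.pos_tag([token])[0][1][:2]
                if pos in POS_TYPES:
                    lem = lemmatizer.lemmatize(token, pos=DI_POS_TYPES[pos])
                    reviews_tokens.append((token, stemmer.stem(token), lem, pos))
            review_lemmas.append(lem)
        token_lists.append(review_tokens)
        lemma_strings.append(' '.join(review_lemmas))
    return reviews_tokens, token_lists, lemma_strings
# usage: reviewsTokens, reviewsTokenLists, reviewsLemmaStrings = process_reviews(dfPosFake['Review'])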
# process reviews by removing stopwords in positive fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfPosFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("positive fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("positive fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
positive fake dfTokens.head(10):
0
0 -
1 place
2 -
3 recommend
4 -
5 -
6 coworker
7 -
8 come
9 -
positive fake dfLemma.head(10):
lem
0 - right - - - - - deal - - reality - - gramercy tavern - - - bar - night - - week - - eat - - tavern - - - - cocktail - great - - food - solid - - - bartender monthursday - - - - best - ever - - - drink whiskey - - seriously drink - leave la vega seriously - get - - flatiron - even - - - - - menu - also - everything - - ever heard - - service - true - - - ny - - nt - - rare - - food - - back - phenomenal - shy away - - rabit - thumper - - thing - - - - inevitably surround - - series - event dinner - anniversary - funeral - first orgasm - - always get - - something - - - meal - leave - - - prix fixe coma - - - - need - entertain - prospective inlaws - eat - front - talk trash - - bar - marvel - - - - move - many people - elegantly - - restuarant - - drink - - - best cocktail - america - - try - rare german wine - - meal - - get - - - - - - - fyi - - - - single joint -
1 - - - favorite place - - city - - pizza - - worth - hour long wait - take - seat - - bar - - - corner cubby - - drink - - - - fine - lombardi - - also - - - - place willing - serve - - magnum - wine - - really decent price - - perfect place - low key night - good friend - - - avoid delivery - - - - disappointed - single time -
2 - place really know - - - - - foodie kind - joint - quail - varities - fish - rabbit - etc - - - tapasstyle serving - great wine list - hard - go wrong - -
3 - place - great - - people - - tapa plate - - single - - - homerun - sit - watch - food - cooked make - - much well - - nt - eat - cramped tight space - - - make - experience - personal - - - wait - - - hour - - great - - - sent - - corner - bar jamon - - great experience -
4 - order - get tapa - - small table - - need - wait - least - min - - saturday night - - rest assure - - - leave - - satisfied customer - try - - - look - place - - pas - - - - - little hole - - wall - - result - leave - ask - - - - tapa - - tasty especially - patatas fritas - bread wfava bean spread - - pork sandwich - - drink - recommed - white sangria wstrawberries - - finish - night - nice dessert - cake wdulce de leche ice cream - enjoy -
5 wow - - - great restaurant - food - decor - service - top notch - try - rotisserie chicken - - fresh shrimp - - dessert area fantastic - - nt wait - - second visit -
6 feel - southern delight - gotcha - - - - - right place - - gumbo go - - fast - - finger - - - thumbsup - ready - - hot stuff - - cajun martini - - - - make - walk - - wall - - - ceiling - back - - - - side - - - - want - stay - - hangover breakfast - tickle - sore head - - best bloody mary - town -
7 - - - - numerous time - - taste menu - lunch - - service - - food - consistently excellent lot - small touch - really add - - - superb experience - meal - - little bit - - splurge - - definitely worth - -
8 - expensive - - - nice - course mealroom - - little loud - service - excellent - - - - place - royal people - huge gorgeous crystal chandelier overhead - - - live roam violinist - enjoy - food - sip - - wine - - hear - - live music
9 - food - terrific - - dinner - delicious - elegant - brunch - great - classic - super sandwich - great - brunch - family - kid - wonderful service - - - - favorite spot - - neighborhood -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each positive fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in positive fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each positive fake lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 food 12 food food NN 3495
1 place 93 places place NN 3121
2 great 11 great great JJ 3075
3 good 122 good good JJ 2521
4 go 146 go go VB 2275
5 nt 37 nt nt NN 1926
6 get 26 get get VB 1733
7 service 34 service servic NN 1511
8 love 403 love love NN 1466
9 time 128 time time NN 1427
Top 10 words by part of speech used in positive fake reviews:
POS_TYPE: NN
lem index token stem pos counts
0 food 12 food food NN 3495
1 place 93 places place NN 3121
5 nt 37 nt nt NN 1926
7 service 34 service servic NN 1511
8 love 403 love love NN 1466
9 time 128 time time NN 1427
12 restaurant 234 restaurant restaur NN 1286
14 try 83 try tri NN 1204
17 wait 99 wait wait NN 1123
20 order 179 order order NN 1020
POS_TYPE: JJ
lem index token stem pos counts
2 great 11 great great JJ 3075
3 good 122 good good JJ 2521
10 best 16 best best JJ 1359
11 delicious 336 delicious delici JJ 1307
24 nice 223 nice nice JJ 845
43 new 611 new new JJ 591
49 fresh 243 fresh fresh JJ 568
54 little 199 little littl JJ 540
63 small 182 small small JJ 453
78 much 160 much much JJ 395
POS_TYPE: VB
lem index token stem pos counts
4 go 146 go go VB 2275
6 get 26 get get VB 1733
13 make 159 made made VB 1208
15 come 595 come come VB 1194
21 amaze 475 amazing amaz VB 1015
25 eat 8 eating eat VB 810
34 say 1196 said said VB 664
37 taste 287 tasting tast VB 632
39 take 100 take take VB 611
59 fry 1057 fried fri VB 495
POS_TYPE: RB
lem index token stem pos counts
16 back 40 back back RB 1124
18 really 114 really realli RB 1115
19 well 161 better better RB 1088
23 also 30 also also RB 878
27 always 56 always alway RB 776
30 definitely 305 definitely definit RB 728
31 even 28 even even RB 715
33 friendly 820 friendly friendli RB 670
47 atmosphere 676 atmosphere atmospher RB 576
51 ever 17 ever ever RB 558
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Positive FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Positive FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Positive FakeReview flatTokensList[:10]: ['all', 'right', '', 'so', 'here', 's', 'the', 'deal', '', 'the']
Positive FakeReview Frequency Distribution of all words[:30]: [('the', 22041), ('and', 14820), ('i', 10484), ('a', 10136), ('to', 8143), ('is', 7093), ('was', 6140), ('it', 5870), ('of', 5704), ('for', 4617), ('in', 4475), ('with', 3610), ('you', 3475), ('this', 3460), ('food', 3458), ('we', 3059), ('great', 3052), ('but', 3029), ('my', 2927), ('place', 2897), ('that', 2846), ('good', 2509), ('on', 2400), ('had', 2380), ('s', 2362), ('are', 2317), ('they', 2309), ('have', 2279), ('so', 2129), ('very', 1947)]
Positive FakeReview Frequency Distribution of lemma[:30]: [('food', 3495), ('place', 3121), ('great', 3075), ('good', 2521), ('go', 2275), ('nt', 1926), ('get', 1733), ('service', 1511), ('love', 1466), ('time', 1427), ('best', 1359), ('delicious', 1307), ('restaurant', 1286), ('make', 1208), ('try', 1204), ('come', 1194), ('back', 1124), ('wait', 1123), ('really', 1115), ('well', 1088), ('order', 1020), ('amaze', 1015), ('pizza', 946), ('also', 878), ('nice', 845), ('eat', 810), ('friend', 785), ('always', 776), ('staff', 765), ('menu', 758)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistPosFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
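# The resources list includes a sentiment analysis guide; the real/fake,
# positive/negative split above could be cross-checked against lexicon scores.
# A minimal sketch with NLTK's VADER (assumes nltk.download('vader_lexicon')
# has been run; it scores the raw review text, not the lemmatized strings):
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
for review in dfPosFake['Review'].head(3):
    # compound is in [-1, 1]; > 0 suggests positive, < 0 negative
    print(round(sia.polarity_scores(review)['compound'], 3), review[:60])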
# process reviews by removing stopwords in negative fake reviews
reviewsTokens = []
reviewsTokenLists = []
reviewsLemmaStrings = []
for review in dfNegFake['Review']:
    ITER += 1
    if ITER % SAMPLE_STEP != 1:
        continue
    # tokenize and lowercase each review
    tokens = [word.lower() for sent in nltk.sent_tokenize(review) for word in nltk.word_tokenize(sent)]
    # process tokens in each review
    tokensFromReview = []
    lemmaTokensFromReview = []
    for token in tokens:
        # remove accent marks and punctuation; str.maketrans builds a deletion
        # table (passing string.punctuation straight to translate is a no-op in Python 3)
        newToken = remove_accents(token)
        newToken = newToken.translate(str.maketrans('', '', string.punctuation))
        tokensFromReview.append(newToken)
        # placeholder lemma for non-matches (replaced when a match is found)
        lemmaTokensFromReview.append("-")
        # process newToken to remove stopwords and get word counts
        if newToken not in stopwords:
            if re.search(RE_VALID, newToken):
                if len(newToken) >= MIN_STR_LEN:
                    # parse token as part of speech, defaulting to noun
                    pos = nltk.pos_tag([newToken])[0][1][:2]
                    pos2 = 'n'
                    if pos in DI_POS_TYPES:
                        pos2 = DI_POS_TYPES[pos]
                    stem = stemmer.stem(newToken)
                    lem = lemmatizer.lemmatize(newToken, pos=pos2)
                    if pos in POS_TYPES:
                        reviewsTokens.append((newToken, stem, lem, pos))
                        lemmaTokensFromReview = lemmaTokensFromReview[:-1]
                        lemmaTokensFromReview.append(lem)
    # build reviews lists
    reviewsTokenLists.append(tokensFromReview)
    stringlemmaTokensFromReview = ' '.join(lemmaTokensFromReview)
    reviewsLemmaStrings.append(stringlemmaTokensFromReview)
# build result df (tokens from the last sampled review, as a spot check)
dfTokens = pd.DataFrame(lemmaTokensFromReview)
print("negative fake dfTokens.head(10): ")
print(dfTokens.head(10).to_string())
print()
# replace null with empty string
for token in dfTokens:
    if str(dfTokens[token].dtype) in ('object', 'string_', 'unicode_'):
        dfTokens[token].fillna(value='', inplace=True)
dfLemma = pd.DataFrame(reviewsLemmaStrings, columns=['lem'])
print("negative fake dfLemma.head(10): ")
print(dfLemma.head(10).to_string())
negative fake dfTokens.head(10):
0
0 honestly
1 -
2 everything
3 get
4 ruin
5 -
6 -
7 service
8 -
9 -
negative fake dfLemma.head(10):
lem
0 overrate - something - - place irks - - - - - bland martini - - - entry - - banal menu - - - - - - - - - - nt go - - - - nt stand say - - never - - -
1 - - - - - pizza - - guess - dont know - real pizza taste - - - try difaras - thats - message - everyone - give - place - star -
2 maitre - refuse - believe - - - reservation - lunchtime - - kept interrupt - - - try - give - name - - party - - - - meet - - - - asstant try - get - - leave twice - - - let - finish - sentence - say - - - - - meet - - - ask - - - - - sure - - - - reservation - - - - business attire - - - - late lunch business meeting - - - busy - - overrun - - - never - - - rude interaction - - restaurant - - entire life - - sushi - fresh - portion small - - slew - wait staff - - much well - maitre - - - intrusive - - constantly interrupt - flow - conversation - - - - - west coast - use - fresh sushi - - - - willing - put - - - attitude - - fresh fish - - - wonder - - - chauvinistic towards woman - - - - woman - - - reservation - - get slightly well treatment - - - maitre - finally look - - reservation - - - - able - say - party - first name - - - get cut - - - - told - - - reservation - - name - - finally blurt - - party - last name - - - spell - - - - - - finally found - reservation - - - - finally show - - table - - - - hang - coat - even - - - - - patron - offer - option - - - say - - - - - - business meeting - - - - bad reception - - maitre - - - assistant - - - - - greet - guest - look - reservation - - - never return - -
3 - want - dine - amy ruth - - - last - year - finally - - opportunity yesterday - - - horrendous - - - uninspired cornbread - taste exactly - - come - - - jiffy cornbread box - - - crabcake - - chock full - artificial crabmeat - - - chicken wing - - order - - appetizer - - - - flavor - - wing - order - - local chinese restaurant - - entire meal - - disaster - - - parent - - south carolina - - spent - summer - - youth - - south - - - know southern cuisine - - - - - - - - inclined - believe - - food - - appreciate - people - - nt know anything - southernsoul cooking - - potato salad - watery - - sweet tea - nt - - - - taste - - - - brown sugar water - - - nt taste - tea - - boyfriend - short rib - - wierd color gravy - - - - - nt bad - - - - nt good - - - - say - sweet potato - - - nt yam - - ok - - guy - - door - - nice - allow - - - seat - - din partner park - car - - server - - nice - - - go - look - - authentic - southern cuisine - - - - good soul food - - past - - - - sunday dinner - - mom - house - week - - amy ruth - - compare - - - live - - - - repuation - - food - - mass produce mess - - -
4 service - - place - horrendous - - - nt - - - occasion - - - try - look past - terrible treatment - - - xiaolong bao - soup dumpling - - - - last visit - - never go back - - teacup - sticky - - - - - lipstick - - - - - - wait - minute - - appetizer - - - - - - - - group - people - - restaurant - - food - - good enough - tolerate - extremely poor service - filthy drink ware -
5 - place - typical - - village bar - boring yuppy clientele - - bridge - tunnel - type - nj - li - - look - - - type - write good review - - place - - - night - go - - played great music - include joy division - blonde redhead - arcade fire - - lcd soundsystem - - - course - crowd - - - - - nt know - - appreciate - music - - people - work - seem - know - - - - - - term - clientele - - place - infest - culturallyclueless buttondownshirt type - girl - - nt exactly looker - - place - nt big - - - - - room - dance - - - course nobody - dance - - - action - - see - drunk ugly couple make - - full view - - bar - - - - - lame type - person - actually enjoys - type - bar - head - - - east village - brooklyn - get - taste - - real nightlife experience -
6 - nt really - go - - place - - open till - - - friend - - - roast pork - duck - -
7 first - - - - real name - jorge menendez estebanzarzuela - - - travel - bogota - bistro - serf - bad chimichurri sauce - - ever taste - even bad - - ex wifesister - - come - - brother - - - - see - - - - operacion masacre - - - - - - - - memory - drown - - authentic chipotle - - ancestry - - - mango margarita - fantastic - - server - - cute red head - polish decent - - - proud people - - deliver - pitcher - great smile - - - - - - bunuelos - - taco de pescado - - french fry - - octopus - - - - - - menu - - - - - wonderful - - - - - recommend - - - taste - travel - - brother - - eat - many year - love - - - - - back - - - - different restaurant - - similar contextual - love always - roger
8 - expect much - base - - recomendations - palma - - - start - - good - - atmosphere - great - - even - - little garden - - back - - service - ok - - - reason - visit - - good food - - - - disappointed - - meal - - bland - - end - take - couple bite - send - away - - - nt recommend go - palma - dinner - - - - - want - nice place - go - - glass - wine - - nice set - - great - - -
9 - ownermain cook - - falafel cart go - - bagel store - - street - insult - owner - pretend - speak loud broken korean - - - minute - - - time - turn - - friend - say - - - get away - - - - - - customer gon na - - beat - - - - - food - good - - - - boycotting - place - - - grandmother - life - - - exactly - happen -
# sum of counts
print("Group by all lemma'd words, add count and sort:")
dfAllWords = pd.DataFrame(reviewsTokens, columns=['token', 'stem', 'lem', 'pos'])
dfAllWords['counts'] = dfAllWords.groupby(['lem'])['lem'].transform('count')
dfAllWords = dfAllWords.sort_values(by=['counts', 'lem'], ascending=[False, True]).reset_index()
print("Get just the first row in each negative fake lemma'd group")
dfWords = dfAllWords.groupby('lem').first().sort_values(by='counts', ascending=False).reset_index()
print("dfWords.head(10):")
print(dfWords.head(10))
print()
print("Top 10 words by part of speech used in negative fake reviews:")
for pos in POS_TYPES:
    dfPartofSpeech = dfWords[dfWords['pos'] == pos]
    print()
    print("POS_TYPE:", pos)
    print(dfPartofSpeech.head(10).to_string())
Group by all lemma'd words, add count and sort:
Get just the first row in each negative fake lemma'd group
dfWords.head(10):
lem index token stem pos counts
0 food 197 food food NN 1215
1 nt 9 nt nt NN 1189
2 place 2 place place NN 925
3 go 10 go go VB 847
4 get 44 get get VB 792
5 order 174 ordered order VB 668
6 good 227 good good JJ 615
7 come 163 came came VB 605
8 restaurant 66 restaurant restaur NN 575
9 service 268 service servic NN 570
Top 10 words by part of speech used in negative fake reviews:
POS_TYPE: NN
lem index token stem pos counts
0 food 197 food food NN 1215
1 nt 9 nt nt NN 1189
2 place 2 place place NN 925
8 restaurant 66 restaurant restaur NN 575
9 service 268 service servic NN 570
10 time 529 time time NN 521
11 wait 74 wait wait NN 516
13 table 129 table tabl NN 495
26 friend 398 friends friend NN 315
27 minute 291 minutes minut NN 305
POS_TYPE: JJ
lem index token stem pos counts
6 good 227 good good JJ 615
15 bad 139 bad bad JJ 408
30 great 327 great great JJ 269
42 much 76 much much JJ 215
60 many 460 many mani JJ 160
63 small 72 small small JJ 155
66 nice 236 nice nice JJ 152
70 last 121 last last JJ 145
71 new 712 new new JJ 144
78 next 908 next next JJ 133
POS_TYPE: VB
lem index token stem pos counts
3 go 10 go go VB 847
4 get 44 get get VB 792
5 order 174 ordered order VB 668
7 come 163 came came VB 605
12 say 13 saying say VB 511
16 make 372 making make VB 405
18 ask 52 asked ask VB 382
21 eat 459 eaten eaten VB 362
23 take 494 taking take VB 358
24 give 27 gave gave VB 327
POS_TYPE: RB
lem index token stem pos counts
14 back 286 back back RB 445
17 well 77 better better RB 399
19 even 132 even even RB 373
20 never 14 never never RB 364
22 really 393 really realli RB 358
43 first 111 first first RB 211
53 also 1032 also also RB 181
59 ever 415 ever ever RB 162
80 long 2806 long long RB 133
84 away 498 away away RB 128
# show frequency for all words
flatTokensList = [y for x in reviewsTokenLists for y in x]
print("Negative FakeReview flatTokensList[:10]:", flatTokensList[:10])
print()
freqDist = nltk.FreqDist(flatTokensList)
del freqDist['']
FreqDistSortedWordList = sorted(freqDist.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of all words[:30]: ", FreqDistSortedWordList[:30])
print()
freqDist.plot(30, cumulative=False)
print()
# show frequency of lemmatized words (stop words already removed)
flatLemmaList = dfAllWords['lem'].tolist()
freqDist2 = nltk.FreqDist(flatLemmaList)
FreqDistSortedLemmaList = sorted(freqDist2.items(), key=lambda x: x[1], reverse=True) # sorted list
print("Negative FakeReview Frequency Distribution of lemma[:30]: ", FreqDistSortedLemmaList[:30])
print()
freqDist2.plot(30, cumulative=False)
Negative FakeReview flatTokensList[:10]: ['overrated', '', 'something', 'about', 'this', 'place', 'irks', 'me', '', 'was']
Negative FakeReview Frequency Distribution of all words[:30]: [('the', 8101), ('and', 4719), ('i', 4506), ('to', 4150), ('a', 3475), ('was', 3226), ('it', 2349), ('of', 2242), ('we', 1937), ('for', 1825), ('in', 1762), ('that', 1633), ('is', 1625), ('not', 1426), ('this', 1367), ('but', 1337), ('my', 1271), ('food', 1199), ('nt', 1189), ('they', 1109), ('with', 1073), ('were', 1072), ('you', 1019), ('at', 1009), ('on', 998), ('had', 953), ('have', 886), ('place', 813), ('s', 805), ('so', 791)]
Negative FakeReview Frequency Distribution of lemma[:30]: [('food', 1215), ('nt', 1189), ('place', 925), ('go', 847), ('get', 792), ('order', 668), ('good', 615), ('come', 605), ('restaurant', 575), ('service', 570), ('time', 521), ('wait', 516), ('say', 511), ('table', 495), ('back', 445), ('bad', 408), ('make', 405), ('well', 399), ('ask', 382), ('even', 373), ('never', 364), ('eat', 362), ('really', 358), ('take', 358), ('give', 327), ('want', 321), ('friend', 315), ('minute', 305), ('try', 302), ('people', 294)]
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
# output frequency data to CSV for further analysis in Alteryx
dflemma = pd.DataFrame(FreqDistSortedLemmaList)
dflemma.to_csv("../YelpData/YelpNYC/freqDistNegFakeRevLemma.csv", encoding = 'utf-8', index = False, header = False)
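# The four frequency CSVs exported above can also be compared directly in
# pandas; a minimal sketch (assumes the files written by the to_csv calls
# above, where column 0 is the lemma and column 1 its count; counts are
# normalized so the differently sized subsets are comparable):
files = {
    'posReal': '../YelpData/YelpNYC/freqDistPosRealRevLemma.csv',
    'negReal': '../YelpData/YelpNYC/freqDistNegRealRevLemma.csv',
    'posFake': '../YelpData/YelpNYC/freqDistPosFakeRevLemma.csv',
    'negFake': '../YelpData/YelpNYC/freqDistNegFakeRevLemma.csv',
}
frames = []
for label, path in files.items():
    dfFreq = pd.read_csv(path, header=None, names=['lem', label]).set_index('lem')
    frames.append(dfFreq[label] / dfFreq[label].sum())  # relative frequency
dfCompare = pd.concat(frames, axis=1).fillna(0)
print(dfCompare.sort_values('posFake', ascending=False).head(20))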