In [1]:
# Basic libraries
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import math
import time
import itertools

# Network libraries
import networkx as nx
from fa2 import ForceAtlas2 as FA2
import community

# NLP libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
from wordcloud import WordCloud, ImageColorGenerator

# Display libraries
from IPython.display import display
from IPython.core.display import display as disp, HTML
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
import imageio

from plotly import __version__
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
import chart_studio.plotly as py
import plotly.offline
# Apply seaborn's default theme so subsequent matplotlib figures share one style.
sns.set()

# Make 'notebook' the default plotly renderer so figures are displayed inline.
pio.renderers.default = 'notebook'

# The Yelp Dataset

The dataset we selected for this analysis is the [Yelp dataset](https://www.yelp.com/dataset). It contains a large subset of Yelp's businesses, reviews, check-ins and anonymised public user data. It was originally created for the Kaggle _Yelp Dataset Challenge_, which encouraged academics to conduct research or analysis on this subset of the company's data and share their discoveries with the world.

The original __Yelp Academic Dataset__ contains information about 1,637,138 reviews collected from 192,609 businesses across 10 metropolitan areas in multiple countries. Since this dataset is too large to work with comfortably (__10+ GB of data__), we have restricted it to __one geographical area of interest: Las Vegas, US__, and __one business category: Hotels__.

*Here we present the filtered dataframes for hotels and their reviews*

---

## The Hotels Dataframe

There are **438 hotels** in total, each identified by a unique business id. For every hotel, the name, address, city, state, postal_code, latitude and longitude have been collected. The average star rating of each hotel is shown in the "stars" column, right next to the "review_count" column.

In [2]:
# Read the three Las Vegas extracts (businesses, reviews, business keywords)
# from the local data folder, in that order.
business_df, review_df, keywords = map(
    pd.read_csv,
    (
        './data/las_vegas_all_business.csv',
        './data/las_vegas_all_reviews.csv',
        './data/las_vegas_business_keywords.csv',
    ),
)

In [3]:
# NOTE(review): warnings stay globally silenced exactly as in the original
# cell, because later cells may rely on it. This cell no longer needs it.
import warnings
warnings.filterwarnings('ignore')

# Ids of the businesses whose `categories` string contains "Hotels, " or ends
# with ", Hotels". Non-capturing groups keep the match identical while
# avoiding pandas' "pattern has match groups" UserWarning; na=False treats
# businesses without any categories as non-matches.
hotel_mask = business_df.categories.str.contains(
    r"(?:Hotels, )|(?:, Hotels$)", regex=True, na=False
)
filter_business_id = business_df.loc[hotel_mask, 'business_id']

# Keep only the selected businesses and their reviews, renumbering rows from 0.
# The original chained `.rename({'stars': ...})` without `columns=`; that form
# targets index labels, so it never renamed anything. It is dropped here so the
# code matches the data: both frames keep their rating in the `stars` column.
business_df = business_df[business_df.business_id.isin(filter_business_id)].reset_index(drop=True)
review_df = review_df[review_df.business_id.isin(filter_business_id)].reset_index(drop=True)

# Bare last expression: Jupyter renders the filtered hotels as a rich table.
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,izw7OBVSeNmlVhjly1amQg,Wyndham Vacation Ownership,3475 Las Vegas Blvd S,Las Vegas,NV,89109.0,36.119725,-115.172396,1.5,154,1,"{""BusinessAcceptsCreditCards"": ""True"", ""Restau...","Hotels & Travel, Hotels, Event Planning & Serv...",{}
1,tzlm9Im_8h144lDuIi4o1Q,Extended Stay America - Las Vegas - Midtown,3045 South Maryland Pkwy,Las Vegas,NV,89109.0,36.135074,-115.137917,1.5,48,1,"{""RestaurantsPriceRange2"": ""1"", ""BusinessAccep...","Event Planning & Services, Hotels & Travel, Ho...","{""Monday"": ""0:00-0:00"", ""Tuesday"": ""0:00-0:00""..."
2,mv9CWDyMw5jgFmtrJePN3w,Robertsmith Ranch Resort,6350 W Tara Ave,Las Vegas,NV,89146.0,36.138109,-115.251317,5.0,4,0,"{""RestaurantsPriceRange2"": ""2"", ""WiFi"": ""free""}","Hotels & Travel, Guest Houses, Event Planning ...",{}
3,vchjO0Vs5FwW2tpFnCimqg,Planet Hollywood Towers,"Westgate Resorts, 80 E Harmon Ave",Las Vegas,NV,89109.0,36.124423,-115.146372,3.5,324,0,"{""WiFi"": ""paid"", ""BusinessAcceptsCreditCards"":...","Hotels & Travel, Event Planning & Services, Ho...","{""Monday"": ""9:00-9:00""}"
4,8kWUrcPJNzVsVqt0xtzrdg,Emerald Suites - Convention Center,3684 Paradise Rd,Las Vegas,NV,89109.0,36.122549,-115.153447,1.0,16,1,"{""BusinessAcceptsCreditCards"": ""True"", ""Restau...","Real Estate, Home Services, Hotels & Travel, A...","{""Monday"": ""8:00-20:00"", ""Tuesday"": ""8:00-20:0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,K0U703sJ-Frg-lr2c1JXXA,La Quinta Inn,4975 S Valley View Blvd,Las Vegas,NV,89118.0,36.099568,-115.183582,2.5,6,0,"{""RestaurantsPriceRange2"": ""3"", ""BusinessAccep...","Hotels, Event Planning & Services, Hotels & Tr...",{}
434,VsajJw_iaIvVbxQ0aWCTxg,Vegas Party VIP,"4290 S Cameron St, Ste 3",Las Vegas,NV,89103.0,36.111896,-115.203430,4.5,78,1,"{""RestaurantsGoodForGroups"": ""True"", ""Business...","Party Supplies, Wedding Planning, Hotels, Tick...","{""Monday"": ""0:00-0:00"", ""Tuesday"": ""8:00-23:00..."
435,gf8DvdbgpfRlFQBUsV5gJA,Hampton Inn Las Vegas/Summerlin,7100 Cascade Valley Ct,Las Vegas,NV,89128.0,36.211987,-115.247174,4.0,55,1,"{""WiFi"": ""free"", ""RestaurantsPriceRange2"": ""2""...","Hotels, Hotels & Travel, Event Planning & Serv...","{""Monday"": ""0:00-0:00"", ""Tuesday"": ""0:00-0:00""..."
436,GGoQdKo8QgKfTW23hf4Tuw,MGM Grand Poker Room,3799 Las Vegas Blvd S,Las Vegas,NV,89109.0,36.102437,-115.169351,3.0,65,1,"{""RestaurantsPriceRange2"": ""2"", ""BusinessAccep...","Hotels & Travel, Event Planning & Services, Ho...","{""Monday"": ""0:00-0:00"", ""Tuesday"": ""0:00-0:00""..."


## The Reviews Dataframe

There are **172159 reviews** in total. The review text is stored in the "text" column, the date each review was written in the "date" column, and the rating given by the user in the "stars" column. The usefulness, funniness and coolness of the reviews have also been rated on a scale of 0 to 5. Moreover, the review, user and business ids have been collected.

In [4]:
# Preview the hotel reviews. A bare last expression lets Jupyter render the
# frame as a rich table, truncated to its first and last rows.
review_df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,NHQWhZve7UEaGLt7IwAyDw,H3wPBzQCKCKiHcV7GZK4kg,LkMtMHVetws5_7QfRjPtlg,2.0,5,3,4,This actually used to be one of my favorite ho...,2011-01-09 00:09:50
1,TkcxD1A4N_9kRZwGhUjDKA,bgG0jJyUXta_kczGrQW2Rw,7EZ4Eu7YJ1ltRCC5jXFJrQ,1.0,1,0,0,There wasn't one specific thing that made this...,2013-04-10 16:19:12
2,vPDLvkNLtyiBTRbt-wIeLw,QIy3iLXjBUBqRzwd2lnWGQ,_ZfjpSEO5ntk-1hbnwCR4g,3.0,5,1,4,I stayed here back in April. Friday-Sunday. It...,2012-08-15 01:33:11
3,CbwmSDCidwUHXRgjaEYJiA,liJS3rpRalcZM4G-uHZLjw,bsrj9_hFAql3dlSf244zpg,2.0,0,1,0,I would have given this place 1 star if not fo...,2012-05-28 19:53:06
4,H-BobmvT1h_4M69SVnvjEw,fmhvtUuQoFTLDH1NRIRNQg,W8-Bsk_hHg5pxbt4EhmPWQ,4.0,0,0,0,Spent only one night here and the main reason ...,2016-01-17 19:42:47
...,...,...,...,...,...,...,...,...,...
172154,N5f9aP2kvhlcv_J1KQm8Bw,Y5QquLnjVLvxZLdOuyI2sQ,El4FC8jcawUVgw_0EIcbaQ,1.0,0,0,0,I'm giving them a one because they deliver the...,2019-12-10 16:14:27
172155,KyhO7OlnxgehmUHwkKa2ew,Zvt8EAzDGM1BwwgfRNwtLQ,El4FC8jcawUVgw_0EIcbaQ,1.0,1,0,0,Plenty of better hotels the chose when coming ...,2019-12-08 18:39:15
172156,gV3OIJrXyklRPpeaEQYeEw,janMsyYEYQV5iNohlEIBlg,RCdWzJI0acxponN70QDRcw,4.0,1,0,0,We chose this hotel because it's pet friendly ...,2019-12-12 07:07:55
172157,iRfzGTDOacY_OBD02gvKJg,GLpxKjxA48jSmK9pH9jr3A,RCdWzJI0acxponN70QDRcw,5.0,0,0,0,"I have stayed at Delano multiple times, it is ...",2019-12-11 21:58:32
