Airbnb New York

Análisis de la distribución y caracterísitcas de las casas y pisos de los datos de Airbnb en New York.

1 – Importamos librerias y visualizamos el dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns

df=pd.read_csv('AB_NYC_2019.csv')
df.head()

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365
0	2539	Clean & quiet apt home by the park	2787	John	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	2018-10-19	0.21	6	365
1	2595	Skylit Midtown Castle	2845	Jennifer	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	2019-05-21	0.38	2	355
2	3647	THE VILLAGE OF HARLEM....NEW YORK !	4632	Elisabeth	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	NaN	1	365
3	3831	Cozy Entire Floor of Brownstone	4869	LisaRoxanne	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	2019-07-05	4.64	1	194
4	5022	Entire Apt: Spacious Studio/Loft by central park	7192	Laura	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	2018-11-19	0.10	1	0

df.dtypes

    id                                  int64
    name                               object
    host_id                             int64
    host_name                          object
    neighbourhood_group                object
    neighbourhood                      object
    latitude                          float64
    longitude                         float64
    room_type                          object
    price                               int64
    minimum_nights                      int64
    number_of_reviews                   int64
    last_review                        object
    reviews_per_month                 float64
    calculated_host_listings_count      int64
    availability_365                    int64
    dtype: object

2 – Data wrangling y Data Cleaning

– Vemos a ver cuantosdatos tenemos perdidos:

total = df.isnull().sum().sort_values(ascending=False) #total de valores perdidos por columna ordenados 

#Porcentaje de valores perdidos respecto al total de cada columna
percent = ((df.isnull().sum())*100)/df.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

	Total	Percent
name	16	0.032723
availability_365	0	0.000000
calculated_host_listings_count	0	0.000000
reviews_per_month	0	0.000000
number_of_reviews	0	0.000000
minimum_nights	0	0.000000
price	0	0.000000
room_type	0	0.000000
longitude	0	0.000000
latitude	0	0.000000
neighbourhood	0	0.000000
neighbourhood_group	0	0.000000
host_id	0	0.000000

– Eliminamos las columnas que consideramos poco significantes:

df.drop(['id','host_name','last_review'], axis=1, inplace=True)
df.head(5)

	name	host_id	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365
0	Clean & quiet apt home by the park	2787	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	0.21	6	365
1	Skylit Midtown Castle	2845	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	0.38	2	355
2	THE VILLAGE OF HARLEM....NEW YORK !	4632	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	1	365
3	Cozy Entire Floor of Brownstone	4869	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	4.64	1	194
4	Entire Apt: Spacious Studio/Loft by central park	7192	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	0.10	1	0

– Reemplazamos los NaN de reviews_per_month por 0:

df.fillna({'reviews_per_month':0}, inplace=True)

3 – Exploracion y visualización de los datos

df.neighbourhood_group.unique()

    array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
          dtype=object)

df.neighbourhood.unique()

    array(['Kensington', 'Midtown', 'Harlem', 'Clinton Hill', 'East Harlem',
           'Murray Hill', 'Bedford-Stuyvesant', "Hell's Kitchen",
           'Upper West Side', 'Chinatown', 'South Slope', 'West Village',
           'Williamsburg', 'Fort Greene', 'Chelsea', 'Crown Heights',
           'Park Slope', 'Windsor Terrace', 'Inwood', 'East Village',
           'Greenpoint', 'Bushwick', 'Flatbush', 'Lower East Side',
           'Prospect-Lefferts Gardens', 'Long Island City', 'Kips Bay',
           'SoHo', 'Upper East Side', 'Prospect Heights',
           'Washington Heights', 'Woodside', 'Brooklyn Heights',
           'Carroll Gardens', 'Gowanus', 'Flatlands', 'Cobble Hill',
           'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO', 'St. George',
           'Highbridge', 'Financial District', 'Ridgewood',
           'Morningside Heights', 'Jamaica', 'Middle Village', 'NoHo',
           'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
           'Greenwich Village', 'Little Italy', 'East Flatbush',
           'Tompkinsville', 'Astoria', 'Clason Point', 'Eastchester',
           'Kingsbridge', 'Two Bridges', 'Queens Village', 'Rockaway Beach',
           'Forest Hills', 'Nolita', 'Woodlawn', 'University Heights',
           'Gravesend', 'Gramercy', 'Allerton', 'East New York',
           'Theater District', 'Concourse Village', 'Sheepshead Bay',
           'Emerson Hill', 'Fort Hamilton', 'Bensonhurst', 'Tribeca',
           'Shore Acres', 'Sunset Park', 'Concourse', 'Elmhurst',
           'Brighton Beach', 'Jackson Heights', 'Cypress Hills', 'St. Albans',
           'Arrochar', 'Rego Park', 'Wakefield', 'Clifton', 'Bay Ridge',
           'Graniteville', 'Spuyten Duyvil', 'Stapleton', 'Briarwood',
           'Ozone Park', 'Columbia St', 'Vinegar Hill', 'Mott Haven',
           'Longwood', 'Canarsie', 'Battery Park City', 'Civic Center',
           'East Elmhurst', 'New Springville', 'Morris Heights', 'Arverne',
           'Cambria Heights', 'Tottenville', 'Mariners Harbor', 'Concord',
           'Borough Park', 'Bayside', 'Downtown Brooklyn', 'Port Morris',
           'Fieldston', 'Kew Gardens', 'Midwood', 'College Point',
           'Mount Eden', 'City Island', 'Glendale', 'Port Richmond',
           'Red Hook', 'Richmond Hill', 'Bellerose', 'Maspeth',
           'Williamsbridge', 'Soundview', 'Woodhaven', 'Woodrow',
           'Co-op City', 'Stuyvesant Town', 'Parkchester', 'North Riverdale',
           'Dyker Heights', 'Bronxdale', 'Sea Gate', 'Riverdale',
           'Kew Gardens Hills', 'Bay Terrace', 'Norwood', 'Claremont Village',
           'Whitestone', 'Fordham', 'Bayswater', 'Navy Yard', 'Brownsville',
           'Eltingville', 'Fresh Meadows', 'Mount Hope', 'Lighthouse Hill',
           'Springfield Gardens', 'Howard Beach', 'Belle Harbor',
           'Jamaica Estates', 'Van Nest', 'Morris Park', 'West Brighton',
           'Far Rockaway', 'South Ozone Park', 'Tremont', 'Corona',
           'Great Kills', 'Manhattan Beach', 'Marble Hill', 'Dongan Hills',
           'Castleton Corners', 'East Morrisania', 'Hunts Point', 'Neponsit',
           'Pelham Bay', 'Randall Manor', 'Throgs Neck', 'Todt Hill',
           'West Farms', 'Silver Lake', 'Morrisania', 'Laurelton',
           'Grymes Hill', 'Holliswood', 'Pelham Gardens', 'Belmont',
           'Rosedale', 'Edgemere', 'New Brighton', 'Midland Beach',
           'Baychester', 'Melrose', 'Bergen Beach', 'Richmondtown',
           'Howland Hook', 'Schuylerville', 'Coney Island', 'New Dorp Beach',
           "Prince's Bay", 'South Beach', 'Bath Beach', 'Jamaica Hills',
           'Oakwood', 'Castle Hill', 'Hollis', 'Douglaston', 'Huguenot',
           'Olinville', 'Edenwald', 'Grant City', 'Westerleigh',
           'Bay Terrace, Staten Island', 'Westchester Square', 'Little Neck',
           'Fort Wadsworth', 'Rosebank', 'Unionport', 'Mill Basin',
           'Arden Heights', "Bull's Head", 'New Dorp', 'Rossville',
           'Breezy Point', 'Willowbrook'], dtype=object)

df.room_type.unique()

    array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

– Top 10 de los host con más propiedades:

top_host=df.host_id.value_counts().head(10)
top_host

    219517861    327
    107434423    232
    30283594     121
    137358866    103
    12243051      96
    16098958      96
    61391963      91
    22541573      87
    200380610     65
    7503643       52
    Name: host_id, dtype: int64

#setting figure size for future visualizations
sns.set(rc={'figure.figsize':(10,8)})
viz_1=top_host.plot(kind='bar')
viz_1.set_title('Hosts with the most listings in NYC')
viz_1.set_ylabel('Count of listings')
viz_1.set_xlabel('Host IDs')
viz_1.set_xticklabels(viz_1.get_xticklabels(), rotation=45)

Relación ente el distrito y el precio:

– Vamos a echar un rápido de cómo están distribiodos los datos:

red_square = dict(markerfacecolor='salmon', markeredgecolor='salmon', marker='.')

df.boxplot(column='price', by='neighbourhood_group', 
           flierprops=red_square, vert=False, figsize=(10,8))

plt.xlabel('\nMedian Price', fontsize=12)
plt.ylabel('District\n', fontsize=12)
plt.title('\nBoxplot: Prices by Neighbourhood\n', fontsize=14, fontweight='bold')

# get rid of automatic boxplot title
plt.suptitle('')

Vemos que hay bastantes outliers. Vamos a proceder a limitar el margen del precio.

– Extraemos los precios por cada distrito:

#Brooklyn
sub_1=df.loc[df['neighbourhood_group'] == 'Brooklyn']
price_sub1=sub_1[['price']]
#Manhattan
sub_2=df.loc[df['neighbourhood_group'] == 'Manhattan']
price_sub2=sub_2[['price']]
#Queens
sub_3=df.loc[df['neighbourhood_group'] == 'Queens']
price_sub3=sub_3[['price']]
#Staten Island
sub_4=df.loc[df['neighbourhood_group'] == 'Staten Island']
price_sub4=sub_4[['price']]
#Bronx
sub_5=df.loc[df['neighbourhood_group'] == 'Bronx']
price_sub5=sub_5[['price']]

#Metemos todos los df's en una lista
price_list_by_n=[price_sub1, price_sub2, price_sub3, price_sub4, price_sub5]

# Creamos una lista vacia donde se irá añadiendo el precio para cada distrito
p_l_b_n_2=[]

#Creamos una lista con los valores conocidos de los distritos
nei_list=['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']

#loop para conocer los estadísticos por cada rango de precio que se guardará en nuestra lista vacia de arriba
for x in price_list_by_n:
    i=x.describe(percentiles=[.25, .50, .75])
    i=i.iloc[3:]
    i.reset_index(inplace=True)
    i.rename(columns={'index':'Stats'}, inplace=True)
    p_l_b_n_2.append(i)
    
#cambiar los nombres de la columna de precios al nombre del área para facilitar la lectura de la tabla   
p_l_b_n_2[0].rename(columns={'price':nei_list[0]}, inplace=True)
p_l_b_n_2[1].rename(columns={'price':nei_list[1]}, inplace=True)
p_l_b_n_2[2].rename(columns={'price':nei_list[2]}, inplace=True)
p_l_b_n_2[3].rename(columns={'price':nei_list[3]}, inplace=True)
p_l_b_n_2[4].rename(columns={'price':nei_list[4]}, inplace=True)

stat_df=p_l_b_n_2
stat_df=[df.set_index('Stats') for df in stat_df]
stat_df=stat_df[0].join(stat_df[1:])
stat_df

	Brooklyn	Manhattan	Queens	Staten Island	Bronx
Stats
min	0.0	0.0	10.0	13.0	0.0
25%	60.0	95.0	50.0	50.0	45.0
50%	90.0	150.0	75.0	75.0	65.0
75%	150.0	220.0	110.0	110.0	99.0
max	10000.0	10000.0	10000.0	5000.0	2500.0

Como hemos visto en los boxplot de arriba, hay varios outliers que nos pueden estar molestando, vamos a eliminarlos.

– Vamos a ver la densidad de la distribución de los precios usando violinplot:

sub_6=df[df.price < 500] #nos quedmoas con los que cuestan menos de 500$

viz_2=sns.violinplot(data=sub_6, x='neighbourhood_group', y='price')
viz_2.set_title('Density and distribution of prices for each neighberhood_group')

Relación de lo Barrios de los distritos frente al tipo de habitaciones

– Top 10 de barrios con propiedades:

df.neighbourhood.value_counts().head(10)

    Williamsburg          3920
    Bedford-Stuyvesant    3714
    Harlem                2658
    Bushwick              2465
    Upper West Side       1971
    Hell's Kitchen        1958
    East Village          1853
    Upper East Side       1798
    Crown Heights         1564
    Midtown               1545
    Name: neighbourhood, dtype: int64

#grabbing top 10 neighbourhoods for sub-dataframe
sub_7=df.loc[df['neighbourhood'].isin(['Williamsburg','Bedford-Stuyvesant','Harlem','Bushwick',
                 'Upper West Side','Hell\'s Kitchen','East Village','Upper East Side','Crown Heights','Midtown'])]

#using catplot to represent multiple interesting attributes together and a count
viz_3=sns.catplot(x='neighbourhood', hue='neighbourhood_group', col='room_type', data=sub_7, kind='count')
viz_3.set_xticklabels(rotation=90)

4 – Mapas

– Mapa de precios:

viz_4=sub_6.plot(kind='scatter', x='longitude', y='latitude', label='availability_365', c='price',
                  cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, figsize=(10,8))
viz_4.legend()

import urllib

plt.figure(figsize=(10,8))

i=urllib.request.urlopen('https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG')
nyc_img=plt.imread(i)

#scaling the image based on the latitude and longitude max and mins for proper output
plt.imshow(nyc_img,zorder=0,extent=[-74.258, -73.7, 40.49,40.92]) #extent=[izq, drcha, abajo, arriba]
ax=plt.gca()

#using scatterplot again
sub_6.plot(kind='scatter', x='longitude', y='latitude', label='availability_365', c='price', ax=ax, 
           cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, zorder=5)
plt.legend()
plt.show()

Análisis de la columna name

– Extraemos los ‘name’ y los metemos en una lista:

#Creamos una lista vacia
_names_=[]

#tomamos los string de name y los metemos en la lista vacia
for name in df.name:
    _names_.append(name)
    
#Creamos una función que cortará esas cadenas de palabras y las meterá n array
def split_name(name):
    spl=str(name).split()
    return spl

#initializing empty list where we are going to have words counted
_names_for_count_=[]

#obtener la cadena de name de nuestra lista y usamos la función split_name, 
#luego se agrega a la lista anterior
for x in _names_:
    for word in split_name(x):
        word=word.lower() #pasamos a minusculas todo
        _names_for_count_.append(word)

– Usamos Counter para contar las palabras y determinar la más común:

from collections import Counter

_top_25_w=Counter(_names_for_count_).most_common(25)
#_top_25_w=_top_25_w[0:25]

– Metemos nuestro Top25 de palabras en un DataFrame que creamos:

sub_w=pd.DataFrame(_top_25_w)
sub_w.rename(columns={0:'Words', 1:'Count'}, inplace=True)#usamos Counter para contar las palabras y determinar la más común
from collections import Counter

_top_25_w=Counter(_names_for_count_).most_common(25)
#_top_25_w=_top_25_w[0:25]

– Visualizamos todo con un barplot:

viz_5=sns.barplot(x='Words', y='Count', data=sub_w)
viz_5.set_title('Counts of the top 25 used words for listing names')
viz_5.set_ylabel('Count of words')
viz_5.set_xlabel('Words')
viz_5.set_xticklabels(viz_5.get_xticklabels(), rotation=80)

#--------------------- Para poner las etiquetas encima de las barras-----------------------
def add_value_labels(viz_5, spacing=5):
    """Add labels to the end of each bar in a bar chart.

    Arguments:
        ax (matplotlib.axes.Axes): The matplotlib object containing the axes
            of the plot to annotate.
        spacing (int): The distance between the labels and the bars.
    """

    # For each bar: Place a label
    for rect in viz_5.patches:
        # Get X and Y placement of label from rect.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Number of points between bar and label. Change to your liking.
        space = spacing
        # Vertical alignment for positive values
        va = 'bottom'

        # If value of bar is negative: Place label below bar
        if y_value < 0:
            # Invert space to place label below
            space *= -1
            # Vertically align label at top
            va = 'top'

        # Use Y value as label and format number with one decimal place
        label = "{:.1f}".format(y_value)

        # Create annotation
        viz_5.annotate(
            label,                      # Use `label` as label
            (x_value, y_value),         # Place label at end of the bar
            xytext=(0, space),          # Vertically shift label by `space`
            textcoords="offset points", # Interpret `xytext` as offset in points
            ha='center',                # Horizontally center label
            va=va,                      # Vertically align label differently for
            rotation=75)                # positive and negative values.
                                        


# Call the function above. All the magic happens there.
add_value_labels(viz_5)

Análisis de la columna number_of_reviews

top_reviewed_listings=df.nlargest(10,'number_of_reviews')
top_reviewed_listings

	name	host_id	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365
11759	Room near JFK Queen Bed	47621202	Queens	Jamaica	40.66730	-73.76831	Private room	47	1	629	14.58	2	333
2031	Great Bedroom in Manhattan	4734398	Manhattan	Harlem	40.82085	-73.94025	Private room	49	1	607	7.75	3	293
2030	Beautiful Bedroom in Manhattan	4734398	Manhattan	Harlem	40.82124	-73.93838	Private room	49	1	597	7.72	3	342
2015	Private Bedroom in Manhattan	4734398	Manhattan	Harlem	40.82264	-73.94041	Private room	49	1	594	7.57	3	339
13495	Room Near JFK Twin Beds	47621202	Queens	Jamaica	40.66939	-73.76975	Private room	47	1	576	13.40	2	173
10623	Steps away from Laguardia airport	37312959	Queens	East Elmhurst	40.77006	-73.87683	Private room	46	1	543	11.59	5	163
1879	Manhattan Lux Loft.Like.Love.Lots.Look !	2369681	Manhattan	Lower East Side	40.71921	-73.99116	Private room	99	2	540	6.95	1	179
20403	Cozy Room Family Home LGA Airport NO CLEANING FEE	26432133	Queens	East Elmhurst	40.76335	-73.87007	Private room	48	1	510	16.22	5	341
4870	Private brownstone studio Brooklyn	12949460	Brooklyn	Park Slope	40.67926	-73.97711	Entire home/apt	160	1	488	8.14	1	269
471	LG Private Room/Family Friendly	792159	Brooklyn	Bushwick	40.70283	-73.92131	Private room	60	3	480	6.70	1	0

price_avrg=top_reviewed_listings.price.mean()
print('Precio medio por noche: {}'.format(price_avrg))

    Precio medio por noche: 65.4