0. Importing Libraries
%matplotlib inline
#Data processing
import pandas as pd
import geopandas as gpd
import numpy as np
#Data Vis
import seaborn as sns
import matplotlib.pyplot as plt
import contextily as ctx
import matplotlib.patches as mpatches
#Analysis Tools
from pysal.lib import weights
from pysal.explore import esda
from splot.esda import moran_scatterplot, lisa_cluster, plot_local_autocorrelation
from statsmodels.stats.outliers_influence import variance_inflation_factor
#Clustering Tools
from sklearn import cluster
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.neighbors import NearestNeighbors
#PCA Tools
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
#Geographic Data
import osmnx as ox
np.random.seed(123)
1. Importing Data
ds1_link = 'data/neighborhoods.shp'
ds1 = gpd.read_file(ds1_link)
type(ds1)
geopandas.geodataframe.GeoDataFrame
The shapefile (a GeoDataFrame) is made of polygons, each representing a neighbourhood boundary in The Hague.
ds1.head()
 | neighb_cbs | neigb_cijf | geometry
---|---|---|---
0 | Oostduinen | 70 Oostduinen | POLYGON ((4.30290 52.12832, 4.30298 52.12827, ... |
1 | Belgisch Park | 71 Belgisch Park | POLYGON ((4.28056 52.11706, 4.28053 52.11706, ... |
2 | Westbroekpark | 73 Westbroekpark | POLYGON ((4.29171 52.10745, 4.29181 52.10739, ... |
3 | Duttendel | 74 Duttendel | POLYGON ((4.32252 52.10768, 4.32252 52.10766, ... |
4 | Nassaubuurt | 48 Nassaubuurt | POLYGON ((4.31943 52.09247, 4.31943 52.09247, ... |
Print the number of rows (number of neighbourhoods) and columns:
ds1.shape
(114, 3)
Plotting gives a visual depiction of the shapefile and shows the longitude/latitude extent.
ds1.plot()
<AxesSubplot:>
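A quick check of the coordinate reference system (a sketch; EPSG:4326 is an assumption based on the lon/lat coordinates seen above):
# Check the CRS of the neighbourhood polygons
ds1.crs # expected to be a geographic CRS such as EPSG:4326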
The imported data comes from CBS.nl. The year 2017 is selected because it is the latest year that is "complete".
ds2 = pd.read_excel(open('data/kwb-2017.xls', 'rb'), sheet_name='KWB2017') #read from excel rather than csv
ds2.head()
 | gwb_code_10 | gwb_code_8 | regio | gm_naam | recs | gwb_code | ind_wbi | a_inw | a_man | a_vrouw | ... | a_opp_ha | a_lan_ha | a_wat_ha | pst_mvp | pst_dekp | ste_mvs | ste_oad | g_wodief | g_vernoo | g_gewsek
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | NL00 | 0 | Nederland | Nederland | Land | NL00 | . | 17081507 | 8475102 | 8606405 | ... | 4154302 | 3367996 | 786306 | . | . | 2 | 1969 | . | . | . |
1 | GM0003 | 3 | Appingedam | Appingedam | Gemeente | GM0003 | . | 11971 | 5802 | 6169 | ... | 2458 | 2378 | 80 | . | . | 3 | 1045 | 2 | 3 | 6 |
2 | WK000300 | 300 | Wijk 00 | Appingedam | Wijk | WK000300 | 1 | 11970 | 5800 | 6170 | ... | 2458 | 2378 | 80 | . | . | 3 | 1045 | 2 | 3 | 6 |
3 | BU00030000 | 30000 | Appingedam-Centrum | Appingedam | Buurt | BU00030000 | 1 | 2335 | 1090 | 1245 | ... | 90 | 84 | 5 | 9901 | 1 | 3 | 1190 | 1 | 6 | 9 |
4 | BU00030001 | 30001 | Appingedam-West | Appingedam | Buurt | BU00030001 | 1 | 3080 | 1535 | 1545 | ... | 163 | 158 | 5 | 9903 | 5 | 4 | 894 | 2 | 2 | 4 |
5 rows × 111 columns
ds2.shape
(16667, 111)
At this point, the missing data in the file is represented as "."
It is better to represent it as NaN so that pandas functions can recognise the values as missing.
ds2[ds2=="."] = np.nan # replace "." as NaN.
Dataset 2 contains a lot of unnecessary/problematic data:
[a] will be cleaned within this section (Section 1b and after this cell), as this is necessary for merging the datasets.
[b, c] will be cleaned in Section 3, once the variables of interest have been narrowed down.
ds2_filter = ds2[ds2['gm_naam'] == "'s-Gravenhage"] #filter municipality
ds2_filter = ds2_filter[ds2_filter['recs'] == "Buurt"] #filter resolution
ds2_filter.head()
 | gwb_code_10 | gwb_code_8 | regio | gm_naam | recs | gwb_code | ind_wbi | a_inw | a_man | a_vrouw | ... | a_opp_ha | a_lan_ha | a_wat_ha | pst_mvp | pst_dekp | ste_mvs | ste_oad | g_wodief | g_vernoo | g_gewsek
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
8042 | BU05180170 | 5180170 | Oostduinen | 's-Gravenhage | Buurt | BU05180170 | 1 | 0 | 0 | 0 | ... | 318 | 308 | 10 | 2597 | 4 | 4 | 681 | NaN | NaN | NaN |
8044 | BU05180271 | 5180271 | Belgisch Park | 's-Gravenhage | Buurt | BU05180271 | 1 | 8145 | 4025 | 4120 | ... | 102 | 102 | 0 | 2587 | 2 | 2 | 2438 | 3 | 7 | 9 |
8046 | BU05180373 | 5180373 | Westbroekpark | 's-Gravenhage | Buurt | BU05180373 | 1 | 845 | 375 | 465 | ... | 46 | 41 | 5 | 2597 | 3 | 2 | 2075 | 7 | 12 | 12 |
8047 | BU05180374 | 5180374 | Duttendel | 's-Gravenhage | Buurt | BU05180374 | 1 | 1050 | 485 | 565 | ... | 133 | 130 | 3 | 2597 | 1 | 3 | 1103 | 10 | 6 | 6 |
8049 | BU05180448 | 5180448 | Nassaubuurt | 's-Gravenhage | Buurt | BU05180448 | 1 | 1595 | 765 | 830 | ... | 29 | 28 | 0 | 2596 | 1 | 1 | 3212 | 3 | 4 | 6 |
5 rows × 111 columns
To check how much of Dataset 2 remains after filtering, the shape is printed:
ds2_filter.shape
(114, 111)
Also, let us quickly check the columns and the number of available data points each has:
ds2_filter.count().sort_values(ascending = False) #count() counts non-NaN values only
gwb_code_10    114
pst_dekp       114
p_geb          114
a_ste          114
p_ste          114
              ...
g_ele_2w        45
g_gas_2w        42
g_ele_vw        36
g_gas_vw        35
p_stadsv        30
Length: 111, dtype: int64
Rename 'regio' to 'neighb_cbs', so Dataset 1 and Dataset 2 can be compared directly:
ds2_filter.rename(columns = {'regio':'neighb_cbs'}, inplace = True)
ds2_filter.columns[:5] #check first five columns, to see if it has already been renamed
Index(['gwb_code_10', 'gwb_code_8', 'neighb_cbs', 'gm_naam', 'recs'], dtype='object')
Create Merged File and check data + shape:
ds_merge = ds1.merge(ds2_filter) #dataset 1 + dataset 2
ds_merge.head()
 | neighb_cbs | neigb_cijf | geometry | gwb_code_10 | gwb_code_8 | gm_naam | recs | gwb_code | ind_wbi | a_inw | ... | a_opp_ha | a_lan_ha | a_wat_ha | pst_mvp | pst_dekp | ste_mvs | ste_oad | g_wodief | g_vernoo | g_gewsek
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | Oostduinen | 70 Oostduinen | POLYGON ((4.30290 52.12832, 4.30298 52.12827, ... | BU05180170 | 5180170 | 's-Gravenhage | Buurt | BU05180170 | 1 | 0 | ... | 318 | 308 | 10 | 2597 | 4 | 4 | 681 | NaN | NaN | NaN |
1 | Belgisch Park | 71 Belgisch Park | POLYGON ((4.28056 52.11706, 4.28053 52.11706, ... | BU05180271 | 5180271 | 's-Gravenhage | Buurt | BU05180271 | 1 | 8145 | ... | 102 | 102 | 0 | 2587 | 2 | 2 | 2438 | 3 | 7 | 9 |
2 | Westbroekpark | 73 Westbroekpark | POLYGON ((4.29171 52.10745, 4.29181 52.10739, ... | BU05180373 | 5180373 | 's-Gravenhage | Buurt | BU05180373 | 1 | 845 | ... | 46 | 41 | 5 | 2597 | 3 | 2 | 2075 | 7 | 12 | 12 |
3 | Duttendel | 74 Duttendel | POLYGON ((4.32252 52.10768, 4.32252 52.10766, ... | BU05180374 | 5180374 | 's-Gravenhage | Buurt | BU05180374 | 1 | 1050 | ... | 133 | 130 | 3 | 2597 | 1 | 3 | 1103 | 10 | 6 | 6 |
4 | Nassaubuurt | 48 Nassaubuurt | POLYGON ((4.31943 52.09247, 4.31943 52.09247, ... | BU05180448 | 5180448 | 's-Gravenhage | Buurt | BU05180448 | 1 | 1595 | ... | 29 | 28 | 0 | 2596 | 1 | 1 | 3212 | 3 | 4 | 6 |
5 rows × 113 columns
The number of columns seems right too: 113 = 3 [from DS1] + 111 [from DS2] - 1 [shared column neighb_cbs]
ds_merge.shape
(114, 113)
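A small sanity check (a sketch using the objects above) confirms that neighb_cbs is the only column shared by the two datasets, which is what the arithmetic assumes:
# Sanity check: the merge should only share the 'neighb_cbs' column
shared_cols = set(ds1.columns) & set(ds2_filter.columns)
print(shared_cols) # expected: {'neighb_cbs'}
assert ds_merge.shape[1] == ds1.shape[1] + ds2_filter.shape[1] - len(shared_cols)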
# We can also test a random plot (spatial polygons with any column (e.g. 'a_inw')), just to check if all the geometry and indicators are in.
#ds_merge.plot(column='a_inw', categorical=False,
# legend=True, linewidth=0.1)
To explore the completeness of the dataset, the number of datapoints for each variable is printed and sorted in descending order:
v_count = ds_merge.count().sort_values(ascending = False)
print(v_count)
#Also, export to a dataframe for better scrutiny
pd.DataFrame(v_count).to_csv("data/variables_counts.csv")
neighb_cbs    114
a_tur         114
a_geb         114
p_geb         114
a_ste         114
             ...
g_ele_2w       45
g_gas_2w       42
g_ele_vw       36
g_gas_vw       35
p_stadsv       30
Length: 113, dtype: int64
2. Hypothesis
I hypothesize that the neighbourhoods of The Hague with higher housing prices have greater accessibility to amenities (i.e. shorter distances to them). This stems from the belief that accessibility is a main driver/benefit in the real estate market and a cause of the unequal value of housing.
Housing prices will be measured by the average home value (g_woz), which characterises housing prices at the level of the entire neighbourhood. Accessibility to amenities will be measured by the distance to ... (g_afs_...), where ... can be healthcare (GP practice), supermarket, daycare centre or school. I believe distance is a good proxy for accessibility because it is calculated with reference to the neighbourhood at large (>90% of residents).
Additionally, I hypothesize that proximity to schools could be the factor housing prices are most sensitive to, likely because certain "more popular" schools are preferred, which drives up demand and hence raises house prices.
Definition of Metrics
| Variable Name | Variable Code |
| --- | --- |
| Average home value | g_woz |
| Distance to GP practice | g_afs_hp |
| Distance to large supermarket | g_afs_gs |
| Distance to daycare center | g_afs_kv |
| Distance to school | g_afs_sc |
Counting the number of data points for my chosen metrics/variables of interest:
metric = ['g_woz','g_afs_hp','g_afs_gs','g_afs_kv','g_afs_sc'] # create list of the variables of interest
#extract variable of interest from the variable_count table
vc_df = pd.DataFrame(v_count).reset_index().rename(columns = {0: "count", "index": "variable"}) #placing it in a dataframe
vc_df[vc_df['variable'].isin(metric)] #filtering
 | variable | count
---|---|---
61 | g_afs_kv | 110 |
62 | g_afs_gs | 110 |
63 | g_afs_hp | 110 |
65 | g_afs_sc | 110 |
89 | g_woz | 107 |
Reasons for Missing Data:
3. Data Cleaning
Clean your data and make it tabular for your own good! (think about weeks 1-2 and assignment 1)
ds_clean = ds_merge[['neighb_cbs','geometry','g_woz','g_afs_hp','g_afs_gs','g_afs_kv','g_afs_sc']]
Check datatype to see if variables are currently categorical or numerical. They SHOULD be numerical (int/float).
ds_clean.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 114 entries, 0 to 113
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   neighb_cbs  114 non-null    object
 1   geometry    114 non-null    geometry
 2   g_woz       107 non-null    object
 3   g_afs_hp    110 non-null    object
 4   g_afs_gs    110 non-null    object
 5   g_afs_kv    110 non-null    object
 6   g_afs_sc    110 non-null    object
dtypes: geometry(1), object(6)
memory usage: 7.1+ KB
Convert them to numerical (int/float)
#convert average home value
pd.set_option('mode.chained_assignment', None) #suppress chained-assignment warnings
ds_clean["g_woz"] = ds_clean["g_woz"].astype("float64")
#convert proximity variables
ds_clean = ds_clean.apply(lambda x: x.replace({',': '.'}, regex=True)) #replace ',' with '.' so the values can be parsed as decimals
list_to_convert = ['g_afs_hp','g_afs_gs','g_afs_kv','g_afs_sc']
for var in list_to_convert:
ds_clean[var] = ds_clean[var].astype("float64")
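An equivalent, slightly more defensive sketch (not what is run above) uses pd.to_numeric with errors='coerce', so any stray non-numeric entry becomes NaN instead of raising an error:
# Alternative sketch: coerce the comma-decimal strings to floats in one step
for var in ['g_afs_hp','g_afs_gs','g_afs_kv','g_afs_sc']:
    ds_clean[var] = pd.to_numeric(ds_clean[var].astype(str).str.replace(',', '.', regex=False), errors='coerce')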
Checking datatype again:
ds_clean.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 114 entries, 0 to 113
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   neighb_cbs  114 non-null    object
 1   geometry    114 non-null    geometry
 2   g_woz       107 non-null    float64
 3   g_afs_hp    110 non-null    float64
 4   g_afs_gs    110 non-null    float64
 5   g_afs_kv    110 non-null    float64
 6   g_afs_sc    110 non-null    float64
dtypes: float64(5), geometry(1), object(1)
memory usage: 7.1+ KB
ds_clean.rename(columns = {"g_woz":"Average home prices",
"g_afs_hp":"Dist to GP",
"g_afs_gs":"Dist to Supermarket",
"g_afs_kv":"Dist to Daycare",
"g_afs_sc":"Dist to School"}, inplace = True)
ds_clean.head()
 | neighb_cbs | geometry | Average home prices | Dist to GP | Dist to Supermarket | Dist to Daycare | Dist to School
---|---|---|---|---|---|---|---
0 | Oostduinen | POLYGON ((4.30290 52.12832, 4.30298 52.12827, ... | NaN | NaN | NaN | NaN | NaN |
1 | Belgisch Park | POLYGON ((4.28056 52.11706, 4.28053 52.11706, ... | 312.0 | 0.8 | 0.5 | 0.4 | 0.5 |
2 | Westbroekpark | POLYGON ((4.29171 52.10745, 4.29181 52.10739, ... | 581.0 | 0.8 | 1.3 | 0.9 | 0.8 |
3 | Duttendel | POLYGON ((4.32252 52.10768, 4.32252 52.10766, ... | 615.0 | 1.4 | 1.5 | 1.1 | 0.4 |
4 | Nassaubuurt | POLYGON ((4.31943 52.09247, 4.31943 52.09247, ... | 495.0 | 0.3 | 0.3 | 0.2 | 0.5 |
The new column names are "Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"
#ds_clean.dropna(axis=0)
Print the rows that contain NaN values to investigate them and to check their number of inhabitants:
The missing data cannot be interpolated: these are not failures of data collection, but areas that are not acknowledged as (residential) neighbourhoods, which is fair.
#Filter Rows WITH NaN
ds_clean_nan = ds_clean[ds_clean.isna().any(axis=1)] #selecting rows that have any NaN values in them.
#Merge to get no. of inhabitants data
no_inh = ds2_filter[['neighb_cbs','a_inw']].rename(columns = {'a_inw':'No. of inhabitants'}) #Getting the number of inhabitants from dataset2
ds_inh = ds_clean_nan.merge(no_inh)
ds_inh
 | neighb_cbs | geometry | Average home prices | Dist to GP | Dist to Supermarket | Dist to Daycare | Dist to School | No. of inhabitants
---|---|---|---|---|---|---|---|---
0 | Oostduinen | POLYGON ((4.30290 52.12832, 4.30298 52.12827, ... | NaN | NaN | NaN | NaN | NaN | 0 |
1 | Kerketuinen en Zichtenburg | POLYGON ((4.26160 52.05135, 4.26142 52.05125, ... | NaN | 0.7 | 0.8 | 0.7 | 0.5 | 45 |
2 | Vliegeniersbuurt | POLYGON ((4.34913 52.04689, 4.34942 52.04690, ... | NaN | NaN | NaN | NaN | NaN | 0 |
3 | De Reef | POLYGON ((4.37473 52.06138, 4.37483 52.06135, ... | NaN | NaN | NaN | NaN | NaN | 0 |
4 | Tedingerbuurt | POLYGON ((4.37090 52.05809, 4.37371 52.05609, ... | NaN | NaN | NaN | NaN | NaN | 0 |
5 | Vlietzoom-Oost | POLYGON ((4.38436 52.07616, 4.38458 52.07623, ... | NaN | 0.8 | 1.2 | 0.6 | 0.4 | 120 |
6 | De Rivieren | POLYGON ((4.39709 52.07476, 4.39778 52.07496, ... | NaN | 1.2 | 1.6 | 0.6 | 0.9 | 25 |
A quick plot of these neighbourhoods is useful to understand their locations:
# Prepare melted dataframe for plotting.
ds_clean_nan_melt = ds_clean_nan.melt(id_vars = ["neighb_cbs", "geometry"]) #melt
ds_clean_nan_melt = ds_clean_nan_melt[ds_clean_nan_melt["value"].isnull()] #filter data frame to only record polygons with NaN values.
# Setup figure and axis
f, ax = plt.subplots(1, figsize=(7,7))
# Add a base layer on map
( ds_clean.to_crs(epsg=3857)
.plot(color='grey', alpha = 0.5, ax=ax) )
# Add layer with data of missing neighbourhoods
( ds_clean_nan_melt.to_crs(epsg=3857)
.plot(column='neighb_cbs', categorical= True, legend=True, linewidth=0.1, ax=ax,)
.title.set_text("Neighbourhoods with Missing Data") )
# Add base map
ctx.add_basemap(ax=ax, url=ctx.providers.CartoDB.Positron)
# Display
plt.show()
Options:
My Appraisal:
ds_original = ds_clean.copy() #Saving the original non-deleted dataset for future checks
ds_clean = ds_clean[ds_clean["Dist to GP"].notnull()] #114 --> 110
ds_clean_complete = ds_clean[ds_clean["Average home prices"].notnull()] #114 --> 107
Confirm Size of Dataframes
print(ds_clean.shape)
print(ds_clean_complete.shape)
(110, 7)
(107, 7)
4. Data Exploration
Carry out an exploratory data analysis (EDA)
ds_clean.head()
 | neighb_cbs | geometry | Average home prices | Dist to GP | Dist to Supermarket | Dist to Daycare | Dist to School
---|---|---|---|---|---|---|---
1 | Belgisch Park | POLYGON ((4.28056 52.11706, 4.28053 52.11706, ... | 312.0 | 0.8 | 0.5 | 0.4 | 0.5 |
2 | Westbroekpark | POLYGON ((4.29171 52.10745, 4.29181 52.10739, ... | 581.0 | 0.8 | 1.3 | 0.9 | 0.8 |
3 | Duttendel | POLYGON ((4.32252 52.10768, 4.32252 52.10766, ... | 615.0 | 1.4 | 1.5 | 1.1 | 0.4 |
4 | Nassaubuurt | POLYGON ((4.31943 52.09247, 4.31943 52.09247, ... | 495.0 | 0.3 | 0.3 | 0.2 | 0.5 |
5 | Uilennest | POLYGON ((4.34232 52.09748, 4.34233 52.09747, ... | 270.0 | 1.2 | 0.3 | 1.1 | 1.0 |
ds_clean.shape
(110, 7)
Graphical Excellence - Make efficient use of space, Select the best graph type
# Diagram Plotting
f, ax = plt.subplots(1, 1, figsize = (6,4));
ax = sns.histplot(ds_clean[["Average home prices"]], bins = 10, kde= True);
ax.set_title("Distribution of Average home prices for the neighbourhoods in The Hague");
ax.set_xlabel("x1000 Euros");
ax.set_ylabel("No. of Neighbourhoods");
Observations:
Graphical Excellence - Present data in a way to facilitate comparisons, Make efficient use of space, Select the best graph type
ds_pivot = ds_clean[["Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]]
ds_pivot = ds_pivot.melt().dropna()
# Diagram Plotting
f, ax = plt.subplots(1, 1, figsize = (7,4));
ax = sns.boxplot(data=ds_pivot, x='variable', y ="value")
plt.xticks(rotation=45);
ax.set_title("Distribution of Proximity to _____ for the neighbourhoods in The Hague");
ax.set_xlabel("Amenities");
ax.set_ylabel("Distance/ km");
Observations:
A helper function to plot a choropleth based on a column of a dataset:
def plotChoro(dataframe, column_name, title, ax, v_min = None, v_max = None, cat = False, cmap = "viridis"):
    """
    Arguments
    ------------
    dataframe: GeoDataFrame
    column_name: String
    title: String
    ax: Axis
    v_min: Float; min value in range
    v_max: Float; max value in range
    cat: Boolean; categorical?
    cmap: String; colour map
    Returns
    ------------
    pl: Axes with the choropleth drawn on it
    """
    pl = (dataframe
          .to_crs(epsg=3857)
          .plot(column=column_name, categorical=cat, legend=True, linewidth=0.1,
                ax=ax, vmin=v_min, vmax=v_max, cmap=cmap))
    pl.title.set_text(title)
    return pl
Plotting Choropleths - Spatial Distribution of Average Home Prices, and Proximity to Amenities on the map
Graphical Excellence - Make large datasets coherent, Make efficient use of space, Present data in a way to facilitate comparisons
#create subplot
fig, axs = plt.subplots(3,2, figsize=(13,13),
#facecolor='w',
constrained_layout=True,
sharex=True, sharey=True,
subplot_kw=dict(aspect='equal')
)
axs = axs.ravel()
fig.suptitle('Spatial Distribution of Variables of Interest', fontsize=12, y=1.05)
#plot empty areas
for i in [0,2,3,4,5]:
ds_original.to_crs(epsg=3857).plot(color='gainsboro',ax=axs[i])
#plot values
plotChoro(ds_clean, "Average home prices", 'Average Home Prices [x1000 Euros]', axs[0])
plotChoro(ds_clean, "Dist to GP", 'Distance to GP [Km]', axs[2] , 0.2, 1.8)
plotChoro(ds_clean, "Dist to Supermarket", 'Distance to Supermarket [Km]', axs[3], 0.2, 1.8)
plotChoro(ds_clean, "Dist to Daycare", 'Distance to Daycare [Km]', axs[4], 0.2, 1.8)
plotChoro(ds_clean, "Dist to School", 'Distance to School [Km]', axs[5], 0.2, 1.8)
#plot amenities
gp = ox.geometries.geometries_from_place('The Hague, NL', tags = {"amenity":"doctors"} )
gp.to_crs(epsg=3857).plot(ax = axs[2], color = "red" ,markersize =2)
spmkt = ox.geometries.geometries_from_place('The Hague, NL', tags = {"shop":"supermarket"} )
spmkt.to_crs(epsg=3857).plot(ax = axs[3], color = "red" ,markersize =2)
daycare = ox.geometries.geometries_from_place('The Hague, NL', tags = {"amenity":"childcare"} )
daycare.to_crs(epsg=3857).plot(ax = axs[4], color = "red" ,markersize =2)
school = ox.geometries.geometries_from_place('The Hague, NL', tags = {"amenity":"school"} )
school.to_crs(epsg=3857).plot(ax = axs[5], color = "red" ,markersize = 2)
axs[1].text(0.1, 0.5, '*Red dots show corresponding amenities', horizontalalignment='center', verticalalignment='center', color = "red", transform=axs[1].transAxes)
#plot basemap
for i in [0,2,3,4,5]:
ctx.add_basemap(ax=axs[i], url=ctx.providers.CartoDB.Positron)
axs[1].set_axis_off() #Remove row 1 right diagram.
Observations:
Reflection:
I want to further study the global and local autocorrelation of average housing prices of neighbourhoods in The Hague
Preparing Different Spatial Weights
w_queen = weights.Queen.from_dataframe(ds_clean_complete, idVariable='neighb_cbs', geom_col = "geometry")
w_queen.transform = 'R' #row-standardise: each row of weights sums to 1
w_rook = weights.Rook.from_dataframe(ds_clean_complete, idVariable='neighb_cbs', geom_col = "geometry")
w_rook.transform = 'R' #row-standardise: each row of weights sums to 1
w_knn = weights.KNN.from_dataframe(ds_clean_complete, ids = 'neighb_cbs', geom_col = "geometry", k=4)
w_knn.transform = 'R' #row-standardise: each row of weights sums to 1
#Testing the Spatial Weights
pd.DataFrame(w_knn).head()
 | 0 | 1
---|---|---
0 | Belgisch Park | {'Rijslag': 0.25, 'Westbroekpark': 0.25, 'Sche... |
1 | Westbroekpark | {'Van Stolkpark en Scheveningse Bosjes': 0.25,... |
2 | Duttendel | {'Waalsdorp': 0.25, 'Westbroekpark': 0.25, 'Ar... |
3 | Nassaubuurt | {'Arendsdorp': 0.25, 'Van Hoytemastraat en omg... |
4 | Uilennest | {'Haagse Bos': 0.25, 'Van Hoytemastraat en omg... |
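The transform = 'R' row-standardises the weights, i.e. each observation's weights are rescaled so that they sum to one. A quick hedged check of this (a sketch using the sparse representation of the weights):
# Sketch: each row of the row-standardised KNN weights should sum to ~1
w_knn.sparse.sum(axis=1)[:5]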
fig,axs = plt.subplots(2,3, figsize=(10,5),facecolor='w',
constrained_layout=True,
sharex=True, sharey=True,
subplot_kw=dict(aspect='equal'))
axs = axs.ravel()
#Display Moran Scatter for each Variable
var_list = ["Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]
i = 0
for var in var_list:
mi = esda.Moran(ds_clean_complete[var], w_knn)
moran_scatterplot(mi, ax= axs[i]);
axs[i].set_title(var + ", (Moran I=" + str( "{:.2f})".format(mi.I)) )
i = i+1
axs[5].set_axis_off() #Remove row 2 right diagram.
Print the Moran I values to compare the variables:
var_list = ["Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]
#compare_I = pd.DataFrame(columns = ["var","I"])
listI = []
for var in var_list:
mi = esda.Moran(ds_clean_complete[var], w_knn)
listI.append([var, mi.I])
pd.DataFrame(listI, columns = ['var', 'Moran I'])
 | var | Moran I
---|---|---
0 | Average home prices | 0.486430 |
1 | Dist to GP | 0.285289 |
2 | Dist to Supermarket | 0.440080 |
3 | Dist to Daycare | 0.092178 |
4 | Dist to School | 0.155680 |
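To judge whether these Moran I values are statistically meaningful, here is a small sketch (assuming esda's default permutation test with 999 permutations) that also reports the pseudo p-values:
# Sketch: Moran's I together with pseudo p-values from the permutation test
rows = []
for var in var_list:
    mi = esda.Moran(ds_clean_complete[var], w_knn)
    rows.append([var, mi.I, mi.p_sim])
pd.DataFrame(rows, columns = ['var', 'Moran I', 'p_sim'])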
Our previous observations:
Interpretation from Moran I for Average home prices:
Interpretation from Moran I for Proximity/Accessibility data:
Create a function to prepare the dataframe and the LISA object
def createLISA(df, col):
    """
    Arguments
    ------------
    df: DataFrame
    col: String; column to compute the local Moran statistics for
    Returns
    ------------
    lisa: Moran_Local object
    ds_lisa: Updated DataFrame
    """
    ds_lisa = df.copy()
    lisa = esda.Moran_Local(ds_lisa[col], w_knn)
    # Break observations into significant or not (p < 0.05)
    ds_lisa['significant'] = lisa.p_sim < 0.05
    # Store the quadrant they belong to
    ds_lisa['quadrant'] = lisa.q
    return lisa, ds_lisa
Plotting Autocorrelation for Average Home Prices
Graphical Excellence - Make efficient use of space, Present data in a way to facilitate comparisons, Make large datasets coherent
lisa, ds_lisa = createLISA(ds_clean_complete, "Average home prices")
#lisa_cluster(lisa, ds_lisa);
plot_local_autocorrelation(lisa, ds_lisa, 'Average home prices');
Plotting the LISA clusters on a larger map:
# Setup the figure and axis
f, ax = plt.subplots(1, figsize=(8, 5))
# plot Empty Layer
ds_original.to_crs(epsg=3857).plot(ax=ax, color='white', edgecolor='white', linewidth=0.5)
# Plot insignificant clusters
ns = ds_lisa.loc[ds_lisa['significant']==False, 'geometry']
ns.to_crs(epsg=3857).plot(ax=ax, color='grey')
# Plot HH clusters
hh = ds_lisa.loc[(ds_lisa['quadrant']==1) & (ds_lisa['significant']==True), 'geometry']
hh.to_crs(epsg=3857).plot(ax=ax, color='red')
# Plot LL clusters
ll = ds_lisa.loc[(ds_lisa['quadrant']==3) & (ds_lisa['significant']==True), 'geometry']
ll.to_crs(epsg=3857).plot(ax=ax, color='blue')
## Plot LH clusters
#lh = ds_lisa.loc[(ds_lisa['quadrant']==2) & (ds_lisa['significant']==True), 'geometry']
#lh.to_crs(epsg=3857).plot(ax=ax, color='#83cef4')
# Plot HL clusters
hl = ds_lisa.loc[(ds_lisa['quadrant']==4) & (ds_lisa['significant']==True), 'geometry']
hl.to_crs(epsg=3857).plot(ax=ax, color='#e59696')
# Add Base Map
ctx.add_basemap(ax=ax, url=ctx.providers.CartoDB.Positron)
# Style and draw
f.suptitle('LISA for Average Home Price', size=20)
f.set_facecolor('0.75')
ax.set_axis_off()
red_patch = mpatches.Patch(color='red', label='H-H')
blue_patch = mpatches.Patch(color='blue', label='L-L')
orange_patch = mpatches.Patch(color='#e59696', label='H-L')
grey_patch = mpatches.Patch(color='grey', label='Not Sig')
white_patch = mpatches.Patch(color='white', label='Missing')
plt.legend(handles=[red_patch, orange_patch, blue_patch, grey_patch, white_patch])
plt.show()
Observations
sns.pairplot(data= ds_clean[["Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]], kind="reg", plot_kws={'line_kws':{'color':'red'}});
Observations
f, ax = plt.subplots(1, 1, figsize = (6,5));
ax.set_title("Correlation Matrix of Variables");
sns.heatmap(ds_clean[["Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]].corr(), annot = True, cmap='flare', ax = ax);
Observations
# the independent variables set
X = ds_clean_complete[["Average home prices", "Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]]
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data
 | feature | VIF
---|---|---
0 | Average home prices | 5.179058 |
1 | Dist to GP | 10.434053 |
2 | Dist to Supermarket | 8.761976 |
3 | Dist to Daycare | 9.889129 |
4 | Dist to School | 7.272249 |
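As a hedged alternative sketch (an assumption about practice, not part of the analysis above), the VIF can also be computed for the four distance predictors alone, since "Average home prices" is the outcome of interest rather than a predictor:
# Sketch: VIF for the distance predictors only (excludes the outcome variable)
X_pred = ds_clean_complete[["Dist to GP", "Dist to Supermarket", "Dist to Daycare", "Dist to School"]]
pd.DataFrame({"feature": X_pred.columns,
              "VIF": [variance_inflation_factor(X_pred.values, i) for i in range(X_pred.shape[1])]})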
Observations
5. Analysis Approach/Planning
6. Modelling (Clustering Model)
Prepare Dataset for Clustering
# List of Variables
variables = ["Average home prices","Dist to GP","Dist to Supermarket","Dist to Daycare","Dist to School"]
# Create a copy of the original dataset (entirely complete, no NaN)
(K-means) Function to create Kmeans Clusters
def createKmeans(df, variables, k):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
k: K parameter for K-means
Returns
------------
kmeans: KMeans cluster object
df: Updated DataFrame
SSE: Float; sum of squared errors (inertia)
"""
# Fix seed
np.random.seed(6969)
# Set and Run the clustering algorithm
kmeans = cluster.KMeans(n_clusters = k) #number of clusters
kcls = kmeans.fit(df[variables])
SSE = kmeans.inertia_
# Place Label in Dataframe
df['kcls'] = kcls.labels_
return kmeans, df, SSE
(K-means) Function to Check optimal K value (Using SSE)
def checkoptimalK(df, variables, kmax, xline):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
kmax: Int; Maximum K Value
xline: Int; Adding an X intercept
"""
SSE_value = []
k = []
for i in range (2,kmax+1):
result = createKmeans(df, variables, k=i)
k.append(i)
SSE_value.append(result[2]) #3rd element is the inertia
#plotting
plt.plot(k,SSE_value)
plt.axvline(x=xline, linewidth=1, linestyle='dashed', color='k')
plt.xlabel('Values of K')
plt.ylabel('Sum of squared distances/Inertia')
plt.title('Elbow Method For Optimal k')
(DBSCAN) Function to create DBSCAN Clusters
def createDBSCAN(df, variables, eps, min_s):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
eps: Float; epsilon (neighbourhood radius) parameter
min_s: minimum samples
Returns
------------
dbs: Cluster Object
df: Updated DataFrame
"""
# Fix seed
np.random.seed(6969)
# Set and Run the clustering algorithm
dbs = DBSCAN(eps=eps, min_samples=min_s) #set the DBSCAN parameters
dbcls = dbs.fit(df[variables])
# Place Label in Dataframe
df['dbcls'] = dbcls.labels_
return dbs, df
(DBSCAN) Function to check optimal epsilon
def checkoptimalE(df, variables, k, yline):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
k: Int; K-value
yline: Int; Adding an Y intercept
"""
nbrs = NearestNeighbors(n_neighbors=k).fit(df[variables])
# Find the k-neighbors of a point
neigh_dist, neigh_ind = nbrs.kneighbors(df[variables])
sort_neigh_dist = np.sort(neigh_dist, axis=0)
k_dist = sort_neigh_dist[:, 4]
plt.plot(k_dist)
plt.axhline(y=yline, linewidth=1, linestyle='dashed', color='k')
plt.ylabel("k-NN distance")
plt.xlabel("Sorted observations (4th NN)")
plt.title('Knee Method For Optimal e')
plt.show()
(Mean-shift) Function to create Mean-Shift Clusters
def createMeanShift(df, variables):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
Returns
------------
mscls: Cluster Object
df: Updated DataFrame
"""
# Fix seed
np.random.seed(6969)
# Set and Run the clustering algorithm
bandwidth = estimate_bandwidth(df[variables], quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
mscls = ms.fit(df[variables])
# Place Label in Dataframe
df['mscls'] = mscls.labels_
return mscls, df
(Regionalization) Function to create Regionalization/Agglomerative Clusters
def createReg(df, variables, n):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
n: Int; Number of Clusters
Returns
------------
reg: Cluster Object
df: Updated DataFrame
"""
# Fix seed
np.random.seed(6969)
reg = cluster.AgglomerativeClustering(n_clusters=n, connectivity= weights.Queen.from_dataframe(df).sparse)
regcls = reg.fit(df[variables])
# Place Label in Dataframe
df['regcls'] = reg.labels_
return reg, df
Function for Plotting the Distribution of Predictors
def facet_cluster_plot(df, variables, cl_name):
"""
Arguments
------------
df: Dataframe
variables: List of Variables Names
cl_name: Column to Plot
Returns
------------
_: Facet plot
"""
# Preparing the data to plot
to_plot = df.set_index(cl_name) # Name (index) the rows after the category they belong
to_plot = to_plot[variables] # Subset to keep only variables used in K-means clustering
to_plot = (to_plot.stack().reset_index() # Melt table and rename
.rename(columns={'level_1': 'Variable', 0: 'Values'}))
# Setup the facets
facets = sns.FacetGrid(data=to_plot, row='Variable', hue=cl_name, \
sharey=False, sharex=False, aspect=2)
# Build the plot as a `sns.kdeplot`
_ = facets.map(sns.kdeplot, 'Values', shade=True).add_legend()
return _
Check Optimal K value for K-means using Elbow Method
checkoptimalK(ds_clean_complete, variables, 10, xline = 5)
Check Optimal e value for DBSCAN using "Knee" method
checkoptimalE(ds_clean_complete, variables, k=5, yline=16)
km_model, ds_model, sse = createKmeans(ds_clean_complete, variables, k=5)
db_model, ds_model = createDBSCAN(ds_model, variables, eps=16, min_s=3)
#ms_model, ds_model = createMeanShift(ds_model, variables) # Mean shift can be used, but it might be too similar to K-means
re_model, ds_model = createReg(ds_model, variables, n=7)
fig, axs = plt.subplots(1,3, figsize=(18,4))
axs = axs.ravel()
#plot empty areas
for i in [0,1,2]:
ds_original.to_crs(epsg=3857).plot(color='gainsboro',ax=axs[i])
plotChoro(ds_model.to_crs(epsg=3857), "kcls", 'K-Means Clusters', axs[0], cat = True, cmap = "tab10");
plotChoro(ds_model.to_crs(epsg=3857), "dbcls", 'DBSCAN Clusters', axs[1], cat = True, cmap = "tab10");
plotChoro(ds_model.to_crs(epsg=3857), "regcls", 'Regionalization Clusters', axs[2], cat = True, cmap = "tab10");
#plot basemap
for i in [0,1,2]:
ctx.add_basemap(ax=axs[i], url=ctx.providers.CartoDB.Positron)
fig.suptitle('Choropleths of Dataset with different Clustering Methods', fontsize=18, y=1.05);
%%capture --no-display
facets = []
# Setup the facets
for var in variables:
cls_df = ds_model[[var, "kcls", "dbcls", "regcls"]]
cls_df= cls_df.melt(var).rename(columns = {'value': 'cluster'}).rename(columns = {'variable': 'method'})
facets.append(sns.FacetGrid(data=cls_df, col='method', hue="cluster", palette = "tab10", \
sharey=False, sharex=True, aspect=1.2) );
#Adjust Titles
facets[0].fig.subplots_adjust(top=1);
facets[0].fig.suptitle('Clustering Methods and KDEs of each Variable of Interest\n <Rows=Variables, Cols=Method>');
#Build the plot as a `sns.kdeplot`
for facet, var in zip(facets, variables):
_ = facet.map(sns.kdeplot, var, shade=True).add_legend();
Observations
K-Means appears to be the best fit for our analysis.
Reorder the cluster labels so that the clusters are ordered by Average home prices
#order cluster names based on "average home prices"
ds_model['kcls'] = (ds_model['kcls'].replace([4,1], [1,4]))
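A more general sketch (an alternative to the hard-coded swap above, not the method actually used) derives the ordering directly from the cluster means, so it keeps working if the labels change on a re-run:
# Sketch: relabel clusters 0..k-1 in ascending order of mean "Average home prices"
order = ds_model.groupby('kcls')['Average home prices'].mean().sort_values().index
mapping = {old: new for new, old in enumerate(order)}
ds_model['kcls'] = ds_model['kcls'].map(mapping)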
Determining the Percentile of Clusters on Average home prices
#extract cluster sizes by fraction of total
ksizes = ds_model.groupby('kcls').size()
ksizes = ksizes/len(ds_model)
#Translate them to percentiles
percentile = []
for i in range(1,len(ksizes)+1):
percentile.append(sum(ksizes[0:i]))
np.array(percentile)
array([0.40186916, 0.6728972 , 0.85046729, 0.91588785, 1. ])
# Rename Labels
ds_model['percentile'] = ds_model["kcls"]
p_name = ["(0) 0-40%", "(1) 40-67%", "(2) 67-85%","(3) 85-91%","(4) 91-100%"]
for i in range(len(p_name)):
ds_model['percentile'] = ds_model['percentile'].replace(i,p_name[i])
%%capture --no-display
_ = facet_cluster_plot(ds_model.sort_values("percentile"), variables, 'percentile')
# Adjustments of Titles
_.fig.subplots_adjust(top=0.93);
_.fig.suptitle('KDE plots of K-Means Clusters')
Text(0.5, 0.98, 'KDE plots of K-Means Clusters')
Compute the aggregated mean housing value for each cluster
mean_clusters = ds_model.groupby('kcls')['Average home prices'].mean()
mean_clusters
kcls
0    120.093023
1    209.448276
2    298.578947
3    442.857143
4    590.666667
Name: Average home prices, dtype: float64
Observations:
Interpretation:
# Setup figure and an axis grid of one row and two columns
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
ax.axis('equal')
#Empty Blocks
ds_original.to_crs(epsg=3857).plot(color='gainsboro',ax=ax)
ds_model.to_crs(epsg =3857).plot(column = "percentile", categorical=True, legend = True, ax=ax, linewidth=0.5, facecolor='white', cmap = 'viridis')
# Add title
ax.set_title('K-Means Cluster (with percentile of House Prices)')
# Roads and Amenity
gp = ox.geometries.geometries_from_place('The Hague, NL', tags = {"highway":"tertiary"} )
gp.to_crs(epsg =3857).plot(ax = ax, color = "red" , linewidth= 0.7)
gp = ox.geometries.geometries_from_place('The Hague, NL', tags = {"highway":"secondary"} )
gp.to_crs(epsg =3857).plot(ax = ax, color = "red" , linewidth= 0.7)
gp = ox.geometries.geometries_from_place('The Hague, NL', tags = {"amenity":"childcare"} )
gp.to_crs(epsg =3857).plot(ax = ax, color = "red" ,markersize = 9)
# Legend + Basemap
ax.text(0.2, 0.8, '*Red dots show Daycare\n *Red lines show Arterial Roads', horizontalalignment='center', verticalalignment='center', color = "red", transform=ax.transAxes)
ctx.add_basemap(ax=ax, url=ctx.providers.CartoDB.Positron)
plt.show()
Observation - Spatial Configuration of The Hague
7. Exploring Variables through PCA
If you choose the clustering path, explore which variables contribute the most to explaining the clusters using a principal component analysis. Report your results and provide in-depth analysis of the models using quick notes in markdown cells.
# Resetting the index
ds_model.reset_index(inplace= True)
# Separating out the features
x = ds_model.loc[:, variables].values
# Separating out the target
y = ds_model.loc[:,['percentile']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)
# Conduct PCA and place in a dataframe
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['pc1','pc2','pc3','pc4','pc5'])
#create list of individual values
indiv_ratio = pca.explained_variance_ratio_
#create list of cumulative values
accum_ratio = []
for i in range(0,len(indiv_ratio)):
accum_ratio.append(sum(indiv_ratio[0:i+1]))
indexs = [str(x) for x in range(1,6)]
#create distr df to plot
distr = pd.DataFrame({'individual explained variance': indiv_ratio,
'cumulative explained variance': accum_ratio})
distr
 | individual explained variance | cumulative explained variance
---|---|---
0 | 0.612783 | 0.612783 |
1 | 0.160471 | 0.773255 |
2 | 0.096538 | 0.869792 |
3 | 0.076278 | 0.946070 |
4 | 0.053930 | 1.000000 |
Plotting Explained Variance Ratio vs. Number of PCs
fig, ax = plt.subplots()
distr['individual explained variance'].plot(kind='bar', color='red',ax=ax)
distr['cumulative explained variance'].plot(kind='line', marker='*', color='black', ms=10, ax=ax)
plt.title("Choosing an appropriate number of PCs");
plt.xlabel('Principal Component');
plt.ylabel('Explained Variance Ratio');
plt.legend();
ax.set_xticks(distr.index, indexs); #relabel the ticks so the PCs are numbered from 1
Plotting Neighbourhoods through Principal Components 1 and 2
finalDf = pd.concat([principalDf, ds_model[['percentile']]], axis = 1)
sns.scatterplot(data=finalDf.sort_values("percentile"), x='pc1', y='pc2', hue='percentile', style='percentile', palette="tab10").set(title='PC1 explains 61% of the Variance while PC2 explains 16%');
Observation
coef_df = pd.DataFrame(pca.components_, columns = variables)
coef_df.index = coef_df.index+1
coef_df.index.name = 'PC'
coef_df
PC | Average home prices | Dist to GP | Dist to Supermarket | Dist to Daycare | Dist to School
---|---|---|---|---|---
1 | 0.368311 | 0.494286 | 0.472020 | 0.464550 | 0.425933 |
2 | -0.777880 | 0.214790 | -0.294262 | 0.322642 | 0.397594 |
3 | 0.131274 | -0.311229 | -0.022218 | -0.488001 | 0.804528 |
4 | -0.475730 | 0.148016 | 0.742165 | -0.434971 | -0.108460 |
5 | 0.125292 | 0.768619 | -0.373236 | -0.502743 | -0.038362 |
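As a quick follow-up sketch (using the coef_df built above), the variable with the largest absolute loading on each component can be read off directly:
# Sketch: variable contributing most (largest absolute loading) to each PC
coef_df.abs().idxmax(axis=1)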
Observation
Recap of Hypothesis: I hypothesized that the neighbourhoods of The Hague with higher housing prices have shorter distances to amenities. This stems from the belief that accessibility factors are the main drivers/benefits of the real estate market and are the cause of the unequal value of housing.
Low housing prices are correlated with shorter distances to amenities.
Apart from accessibility, housing prices could be heavily influenced by the "centrality" of a neighbourhood's location in the city.
Housing prices could be influenced more by spatial qualities other than proximity.
Privacy, rather than accessibility, might be the real benefit that drives housing prices.
Future ideas for improvement: