import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import cm

from sklearn import datasets
from sklearn.datasets import make_blobs

from mpl_toolkits.mplot3d import Axes3D
import scipy.ndimage

# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

6.1 Motivación: visualizar datos multidimensionales

Clasificación y clustering

Datos bidimensionales con etiquetas

# 100 two-feature samples drawn around 3 centres; the fixed random_state
# makes the draw repeatable. y holds the centre index of each sample.
X, y = make_blobs(n_samples=100, n_features=2, centers=3, random_state=0)

# Tabular view: one column per feature plus the label.
df1 = pd.DataFrame({'x1': X[:, 0], 'x2': X[:, 1], 'label': y})
df1.head(10)
x1 x2 label
0 2.631858 0.689365 1
1 0.080804 4.690690 0
2 3.002519 0.742654 1
3 -0.637628 4.091047 0
4 -0.072283 2.883769 0
5 0.628358 4.460136 0
6 -2.674373 2.480062 2
7 -0.577483 3.005434 2
8 2.727562 1.305125 1
9 0.341948 3.941046 0
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax = axes.ravel()

# Left panel: all points in a single default colour (labels ignored).
ax[0].plot(X[:, 0], X[:, 1], '.')

# Right panel: the same points, one colour and one legend entry per label.
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
for cls in np.unique(y):
    members = X[y == cls]
    ax[1].plot(members[:, 0], members[:, 1], '.', c=colors[cls], label=cls)
ax[1].legend()

# Both panels share the same axis labels.
for panel in ax:
    panel.set(xlabel=r'$X_1$', ylabel=r'$X_2$')
    
_images/06.01_SOM_VisualizarDatos_3_0.png

Datos tridimensionales con etiquetas

# 200 three-feature samples drawn around 4 centres, with a fixed random_state.
X, y = make_blobs(n_samples=200, n_features=3, centers=4, random_state=4)

# Feature columns first, then the label column appended at the end.
df1 = pd.DataFrame(X, columns=['x1', 'x2', 'x3']).assign(label=y)
df1.head()
x1 x2 x3 label
0 9.863844 0.448826 9.282223 0
1 10.596115 -11.280679 -5.449909 2
2 -2.083092 7.565793 -5.662472 3
3 5.578687 5.149093 -6.176931 1
4 4.101746 3.373981 -6.774126 1
# Colour used for each label value in the right-hand subplot.
colorsDict = {
    0: 'orange',
    1: 'red',
    2: 'green',
    3: 'blue',
}

fig = plt.figure(figsize=(20, 10))

# One colour name per row, looked up from the row's label.
point_colors = df1['label'].map(colorsDict).tolist()

# Left subplot (121): every point in blue. Right subplot (122): points
# coloured by label. Each subplot is drawn with a single vectorised scatter
# call instead of one call (and one artist) per row.
for position, colours in ((121, 'blue'), (122, point_colors)):
    ax = fig.add_subplot(position, projection='3d')
    ax.scatter(
        df1['x1'],
        df1['x2'],
        df1['x3'],
        marker='.',
        s=100,
        c=colours,
        # Keep flat colours: a multi-point collection would otherwise be
        # faded by depth, which the one-point-per-call version never showed.
        depthshade=False,
    )
plt.show()
_images/06.01_SOM_VisualizarDatos_6_0.png

Regresión

# Reference curve evaluated at 100 evenly spaced points on [-10, 10].
x = np.linspace(-10, 10, 100)
y = 4*np.sin(x)*np.cos(2*x) + np.sin(3*x)/3 + 11/6*np.cos(5*x)

# Observations: the curve plus uniform noise drawn from [-0.5, 0.5).
ydata = y + np.random.uniform(-0.5, 0.5, size=y.shape[0])

# Curve as a red line, observations as blue dots (only the dots get a
# legend entry).
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
ax.plot(x, y, '-', color='red')
ax.plot(x, ydata, '.', color='blue', label='datos')
ax.legend()
<matplotlib.legend.Legend at 0x7f95704e5160>
_images/06.01_SOM_VisualizarDatos_8_1.png
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={"projection": "3d"})

# Regular grid over [-5, 5) x [-5, 5) with step 0.25.
axis_values = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(axis_values, axis_values)

# Noise-free surface, and the same surface with uniform noise in
# [-0.5, 0.5) added at every grid node.
Z = np.sin(X/3)*np.cos(Y/2) + 2
Z2 = Z + np.random.uniform(-0.5, 0.5, Z.shape)

# Draw the clean surface and overlay the noisy samples as small markers.
surf = ax.plot_surface(
    X, Y, Z,
    cmap=cm.coolwarm,
    linewidth=0,
    antialiased=False,
)
ax.scatter(X, Y, Z2, s=2, label='datos')
ax.legend()
plt.show()
_images/06.01_SOM_VisualizarDatos_9_0.png

Más de tres dimensiones

Cuando tenemos más de tres dimensiones no podemos visualizar los datos directamente.

Datos iris

Usamos la scatter_matrix

# Iris data set: feature matrix plus the integer class of each sample.
iris = datasets.load_iris()
iris_data = iris.data

# One column per feature (named after iris.feature_names) and a final
# 'target' column holding the class.
irisDf = pd.DataFrame(iris_data, columns=iris.feature_names).assign(target=iris.target)
irisDf.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
# Pairwise scatter plots of the four iris features, all points in the
# default colour; the diagonal shows 30-bin histograms.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Per-sample colour codes (50 of each); defined here but not passed to the plot.
colors = np.repeat(['r', 'g', 'b'], 50)
pd.plotting.scatter_matrix(
    frame=df,
    alpha=0.6,
    figsize=(10, 10),
    hist_kwds=dict(bins=30),
)
plt.show()
_images/06.01_SOM_VisualizarDatos_12_0.png
# Pairwise scatter plots of the four iris features, coloured by class;
# the diagonal shows 30-bin histograms.
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Look the colour up from each sample's class index instead of hard-coding
# "50 red, 50 green, 50 blue": the colours then stay aligned with the rows
# whatever the class sizes or row order are.
colors = np.array(['r', 'g', 'b'])[iris.target]

pd.plotting.scatter_matrix(df,
                           alpha=0.6,
                           figsize=(10, 10),
                           color=colors,
                           hist_kwds={'bins': 30})
plt.show()
_images/06.01_SOM_VisualizarDatos_13_0.png
import seaborn as sns

# Use seaborn's "ticks" style for the figures that follow.
sns.set_theme(style="ticks")

# Penguins example data set, fetched by name through seaborn.
df = sns.load_dataset(name="penguins")
df.head(n=10)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male
6 Adelie Torgersen 38.9 17.8 181.0 3625.0 Female
7 Adelie Torgersen 39.2 19.6 195.0 4675.0 Male
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN

Método pairplot del paquete seaborn

https://seaborn.pydata.org/generated/seaborn.pairplot.html

# Grid of pairwise plots for the columns of df, without class colouring.
sns.pairplot(data=df)
<seaborn.axisgrid.PairGrid at 0x7f95705185b0>
_images/06.01_SOM_VisualizarDatos_16_1.png
# Same grid, with points coloured by the 'species' column.
sns.pairplot(data=df, hue="species")
<seaborn.axisgrid.PairGrid at 0x7f9570028670>
_images/06.01_SOM_VisualizarDatos_17_1.png

Puede haber datos sin etiquetas

import numpy as np
import pandas as pd

# Fixed seed so the synthetic sample is the same on every run.
np.random.seed(134)
N = 1000

# Five attributes, each built from earlier ones plus independent Gaussian
# noise. There is no label column. The draws are made in this exact order
# so the random stream is consumed deterministically.
x1 = np.random.normal(loc=0, scale=1, size=N)
x2 = x1 + np.random.normal(loc=0, scale=3, size=N)
x3 = 2 * x1 - x2 + np.random.normal(loc=0, scale=2, size=N)
x4 = x3 * x1 - 4 * x2 + np.random.normal(loc=0, scale=1, size=N)
x5 = x2 - x4 * x1 + np.random.normal(loc=0, scale=2, size=N)

df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5))

df.head()
x1 x2 x3 x4 x5
0 -0.224315 -8.840152 10.145993 33.286302 -1.376902
1 1.337257 2.383882 -1.854636 -11.590022 18.471552
2 0.882366 3.544989 -1.117054 -14.303068 14.009670
3 0.295153 -3.844863 3.634823 15.538617 -4.391063
4 0.780587 -0.465342 2.121288 2.874332 1.209348
import matplotlib.pyplot as plt

# Pairwise scatter plots (histograms on the diagonal) for every column of df.
pd.plotting.scatter_matrix(frame=df, figsize=(10, 10))
plt.show()
_images/06.01_SOM_VisualizarDatos_20_0.png

Proyecciones

Las gráficas anteriores son proyecciones ortogonales de los datos sobre los planos coordenados que se forman al tomar los atributos por parejas.

Ejemplo ad hoc en tres dimensiones

# Six clusters, 40 points per cluster on average (the cluster of each point
# is chosen at random, so the actual counts vary).
clusters = [1, 2, 3, 4, 5, 6]
Npuntos = 40 * len(clusters)

# Centre of each cluster: the six points at distance 1 from the origin
# along the positive and negative coordinate axes.
dictClusters = {
    1: (1, 0, 0),
    2: (0, 1, 0),
    3: (0, 0, 1),
    4: (-1, 0, 0),
    5: (0, -1, 0),
    6: (0, 0, -1),
}

# Standard deviation of the Gaussian noise added around each centre.
dictDesvests = dict.fromkeys(clusters, 0.15)

# Collect the points first and build the frame once at the end. Appending
# rows one by one with df.loc[len(df)] re-allocates the frame on every
# iteration and leaves the column dtype to whatever enlargement infers.
# The draws keep the original order (cluster, then x, y, z noise) so the
# random stream is consumed exactly as before.
label = []
rows = []
for _ in range(Npuntos):
    cluN = np.random.choice(clusters)
    desvest = dictDesvests[cluN]
    rows.append([coord + np.random.normal(0, desvest) for coord in dictClusters[cluN]])
    label.append(cluN)

df = pd.DataFrame(rows, columns=['x1', 'x2', 'x3'])
df['label'] = label
df.head()
x1 x2 x3 label
0 -1.017333 0.098265 0.038147 4
1 -1.266728 -0.147091 0.011565 4
2 -1.010595 -0.059136 -0.128969 4
3 0.018001 -1.076903 0.076536 5
4 -0.088302 -0.908741 -0.319811 5
# Colour used for each cluster label.
colorsDict = {
    1: 'red',
    2: 'green',
    3: 'blue',
    4: 'black',
    5: 'purple',
    6: 'orange',
}

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Unit sphere sampled on an angular grid (u: 0..2*pi, v: 0..pi) and drawn
# as a wireframe.
u = np.linspace(0, 2 * np.pi, 39)
v = np.linspace(0, np.pi, 21)
x = np.outer(np.cos(u), np.sin(v))
y = np.outer(np.sin(u), np.sin(v))
z = np.outer(np.ones(np.size(u)), np.cos(v))
ax.plot_wireframe(x, y, z, rstride=2, cstride=2)

# All data points in one vectorised scatter call (one artist instead of one
# per row). Coordinates are coerced to float so the call does not depend on
# how the frame's columns were typed when it was built.
coords = df[['x1', 'x2', 'x3']].to_numpy(dtype=float)
ax.scatter(
    coords[:, 0],
    coords[:, 1],
    coords[:, 2],
    marker='.',
    s=100,
    c=df['label'].map(colorsDict).tolist(),
    # Flat colours, as when every point was its own single-point scatter.
    depthshade=False,
)

ax.set_xlim((-1.5, 1.5))
ax.set_ylim((-1.5, 1.5))
ax.set_zlim((-1.5, 1.5))

plt.show()
_images/06.01_SOM_VisualizarDatos_24_0.png
# Pairwise plots of the cluster sample, coloured by the 'label' column.
sns.pairplot(data=df, hue="label")
<seaborn.axisgrid.PairGrid at 0x7f957007ebb0>
_images/06.01_SOM_VisualizarDatos_25_1.png
# Colour used for each cluster label.
colorsDict = {
    1: 'red',
    2: 'green',
    3: 'blue',
    4: 'black',
    5: 'purple',
    6: 'orange',
}

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Unit sphere sampled on an angular grid (u: 0..2*pi, v: 0..pi) and drawn
# as a wireframe.
u = np.linspace(0, 2 * np.pi, 39)
v = np.linspace(0, np.pi, 21)
x = np.outer(np.cos(u), np.sin(v))
y = np.outer(np.sin(u), np.sin(v))
z = np.outer(np.ones(np.size(u)), np.cos(v))
ax.plot_wireframe(x, y, z, rstride=2, cstride=2)

# Point coordinates as float arrays and one colour name per point.
px, py, pz = df[['x1', 'x2', 'x3']].to_numpy(dtype=float).T
point_colors = df['label'].map(colorsDict).tolist()
offset = np.full_like(px, 3.0)

# The cloud itself followed by its three axis-aligned projections; each
# projection replaces one coordinate with a constant.
views = [
    (px, py, pz),        # original points
    (-offset, py, pz),   # projection 1: onto the plane x1 = -3
    (px, offset, pz),    # projection 2: onto the plane x2 = 3
    (px, py, -offset),   # projection 3: onto the plane x3 = -3
]

# One vectorised scatter call per view instead of one call per row.
for vx, vy, vz in views:
    ax.scatter(
        vx,
        vy,
        vz,
        marker='.',
        s=100,
        c=point_colors,
        # Flat colours, as when every point was its own single-point scatter.
        depthshade=False,
    )

plt.show()
_images/06.01_SOM_VisualizarDatos_26_0.png

Ninguna proyección es completamente satisfactoria: problema con la distancia.

Nota: estas no son las únicas proyecciones posibles; existen muchas más.

Ejemplo ad hoc en el plano

# Four cluster centres on the corners of a unit square.
c1 = np.array((1, 1))
c2 = np.array((2, 1))
c3 = np.array((1, 2))
c4 = np.array((2, 2))
centers = [c1, c2, c3, c4]
Npuntos = 20  # points per centre

# For every centre draw Npuntos points, each coordinate shifted by uniform
# noise in [-0.25, 0.25). The whole sample is assembled as one array and
# turned into a frame once, instead of enlarging an empty frame row by row.
points = np.vstack(
    [c + np.random.uniform(-0.25, 0.25, (Npuntos, 2)) for c in centers]
)
# Label of a point = position of its centre in `centers`.
labels = np.repeat(np.arange(len(centers)), Npuntos).tolist()

df = pd.DataFrame(points, columns=['x1', 'x2'])
df['label'] = labels
df.head()
x1 x2 label
0 1.056981 0.935569 0
1 0.781720 0.935267 0
2 0.810779 0.867002 0
3 0.906029 1.030259 0
4 1.103518 1.027526 0
# Pairwise plots of the planar sample, coloured by the 'label' column.
sns.pairplot(data=df, hue="label")
<seaborn.axisgrid.PairGrid at 0x7f956d4e38b0>
_images/06.01_SOM_VisualizarDatos_29_1.png
# Projections of every point onto two directions of the plane.
# For a direction r the stored value is <v, r> / <r, r>, i.e. the
# coefficient that multiplies r to give the projected point.
vR1 = np.array((1, 1))
vR2 = np.array((1, -1))
iR1 = np.inner(vR1, vR1)
iR2 = np.inner(vR2, vR2)

# One 2-vector per row of df, in row order.
vectors = [np.array(pair) for pair in zip(df['x1'], df['x2'])]

p1 = [np.inner(vec, vR1) / iR1 for vec in vectors]
p2 = [np.inner(vec, vR2) / iR2 for vec in vectors]

df['p1'] = p1
df['p2'] = p2
df.head()
x1 x2 label p1 p2
0 1.056981 0.935569 0 0.996275 0.060706
1 0.781720 0.935267 0 0.858494 -0.076773
2 0.810779 0.867002 0 0.838890 -0.028112
3 0.906029 1.030259 0 0.968144 -0.062115
4 1.103518 1.027526 0 1.065522 0.037996
# Colour used for each label, and the resulting colour of every row.
colorsDict = {
    0: 'orange',
    1: 'red',
    2: 'green',
    3: 'blue',
}
colores = df['label'].map(colorsDict).tolist()

f, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
left, right = ax
zeros = np.zeros(df.shape[0])

# Left panel: the points and their projections onto the two coordinate
# axes ('<' markers on the line x1 = 0, 'v' markers on the line x2 = 0).
left.scatter(df['x1'], df['x2'], c=colores, alpha=0.5)
left.scatter(zeros, df['x2'], marker='<', c=colores, alpha=0.5)
left.scatter(df['x1'], zeros, marker='v', c=colores, alpha=0.5)
left.set_xlim((-0.5, 3))
left.set_ylim((-0.5, 3))
left.grid()

# Right panel: the points, the two diagonal lines, and the projections onto
# them ('+' markers): (p1, p1) on the first line and (p2, -p2) on the second.
diag = np.linspace(0, 3, 100)
anti = np.linspace(-3, 3, 100)
right.scatter(df['x1'], df['x2'], c=colores, alpha=0.5)
right.plot(diag, diag)
right.plot(anti, -anti)
right.scatter(df['p1'], df['p1'], marker='+', c=colores, alpha=0.5)
right.scatter(df['p2'], -df['p2'], marker='+', c=colores, alpha=0.5)
right.set_xlim((-1, 3))
right.set_ylim((-1, 3))
right.grid()
_images/06.01_SOM_VisualizarDatos_31_0.png

Nota: existen direcciones para las cuales las proyecciones son más adecuadas.

Mapa

fishy

Fig. 26 Mapa de Madrid

Un mapa es una proyección cartográfica en la que puntos geográficos cercanos están cerca en el mapa.