# sklearn降維方法舉例

### 以datasets.digits資料為例匯入相關包

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_digits

#### 匯入資料,0,1,2,3,4,5,6這7個數字的手寫影象

# Load the handwritten-digit images for the 7 classes 0-6.
digits = load_digits(n_class=7)
X = digits.data            # flattened pixel-intensity features, one row per image
y = digits.target          # integer class labels
n_samples, n_features = X.shape

#### 輸出影象

from matplotlib import offsetbox


def plot_embedding(X, title=None):
    """Scatter-plot a 2-D embedding of the digits dataset.

    Each sample is drawn as its digit label, colour-coded by class.  When
    matplotlib provides ``AnnotationBbox``, thumbnails of the original
    8x8 images are overlaid for a subset of well-separated points.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        The 2-D embedded coordinates.
    title : str, optional
        Figure title; omitted when None.
    """
    # Min-max scale each coordinate into [0, 1] so the plot is comparable
    # across embedding methods.
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    print(X)  # BUG FIX: Py2 `print X` statement -> function call (valid in Py2 and Py3)

    plt.figure(figsize=(10, 10))
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        # Draw the class label at the embedded position, coloured per class.
        plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 12})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(digits.data.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            # BUG FIX: the thumbnail was built but never attached to the
            # axes, so no image ever appeared (and `ax` was unused).
            ax.add_artist(imagebox)

    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)


### Johnson-Lindenstrauss Lemma

#### 假設我們有資料 $x \in R^n$，而我們通過一種方法 $f(x)$ 將其降維成 $y \in R^k$，那麼，降維前後任意兩點 $a, b$ 之間的距離有不等式保證：

$(1-\epsilon)\| a-b \|^2 \le \| f(a)-f(b) \|^2 \le (1+\epsilon)\| a-b \|^2$

### Random Projection

#### 實現：

from sklearn import random_projection

# Sparse random projection: map the digit features down to 2-D.
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
start_time = time.time()
X_projected = rp.fit_transform(X)
plot_embedding(X_projected,
               "Random Projection of the digits (time: %.3fs)"
               % (time.time() - start_time))
plt.show()

#### 官方描述：

def sparse_random_matrix(n_components, n_features, density='auto',
                         random_state=None):
    """Generalized Achlioptas random sparse matrix for random projection.

    Setting density to 1 / 3 will yield the original matrix by Dimitris
    Achlioptas while setting a lower value will yield the generalization
    by Ping Li et al.

    If we note :math:`s = 1 / density`, the components of the random matrix
    are drawn from:

    - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
    -  0                              with probability 1 - 1 / s
    -  sqrt(s) / sqrt(n_components)   with probability 1 / 2s

    Read more in the :ref:`User Guide <sparse_random_matrix>`.

    Parameters
    ----------
    n_components : int
        Dimensionality of the target projection space.
    n_features : int
        Dimensionality of the original source space.
    density : float in range ]0, 1] or 'auto', optional (default='auto')
        Ratio of non-zero component in the random projection matrix.
        If density = 'auto', the value is set to the minimum density
        as recommended by Ping Li et al.: 1 / sqrt(n_features).
        Use density = 1 / 3.0 if you want to reproduce the results from
        Achlioptas, 2001.
    random_state : integer, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array or CSR matrix with shape [n_components, n_features]
        The generated sparse random matrix.
    """
    _check_input_size(n_components, n_features)
    density = _check_density(density, n_features)
    rng = check_random_state(random_state)

    if density == 1:
        # skip index generation if totally dense
        components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1
        return 1 / np.sqrt(n_components) * components
    else:
        # Generate location of non zero elements
        indices = []
        offset = 0
        indptr = [offset]
        for i in range(n_components):
            # find the indices of the non-zero components for row i
            n_nonzero_i = rng.binomial(n_features, density)
            indices_i = sample_without_replacement(n_features, n_nonzero_i,
                                                   random_state=rng)
            indices.append(indices_i)
            # BUG FIX: the '+' was lost in transcription -- offset must
            # accumulate row sizes, otherwise indptr is not monotone and
            # the CSR structure below is invalid.
            offset += n_nonzero_i
            indptr.append(offset)

        indices = np.concatenate(indices)

        # Among non zero components the probability of the sign is 50%/50%
        data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1

        # build the CSR structure by concatenating the rows
        components = sp.csr_matrix((data, indices, indptr),
                                   shape=(n_components, n_features))

        return np.sqrt(1 / density) / np.sqrt(n_components) * components


### 2.TSVD(截斷奇異值分解, TruncatedSVD,PCA的一種實現)

#### 對於原始資料矩陣A(N*M) ，N代表樣本個數，M代表維度，對其進行SVD分解：

$A = U\Sigma V^T,\quad \Sigma = \mathrm{diag}(\delta_1, \delta_2, \dots, \delta_n)$，其中 $\Sigma$ 為奇異值對角矩陣：

A = U\Sigma V^T, \Sigma=diag(\delta_1,\delta_2,…,\delta_n)=\left[
\begin{matrix}
\delta_1&0&0&.&.&0\\
0&\delta_2&0&.&.&0\\
0&0&\delta_3&.&.&0\\
.&.&.&.&.&.\\
.&.&.&.&.&.\\
0&0&0&.&.&\delta_n
\end{matrix}
\right]

#### 演示一下TSVD工作原理

In[1]:
import numpy as np
from scipy.linalg import svd
D = np.array([[1, 2], [1, 3], [1, 4]])
D
Out[1]:
array([[1, 2],
[1, 3],
[1, 4]])
In[2]:
U, S, V = svd(D, full_matrices=False)
#我們可以根據SVD的定義，用 U、S 和 V 還原矩陣 D
U.shape, S.shape, V.shape
Out[2]:
((3, 2), (2,), (2, 2))
In[3]:
np.dot(U.dot(np.diag(S)), V)#還原
Out[3]:
array([[ 1.,  2.],
[ 1.,  3.],
[ 1.,  4.]])

#### TruncatedSVD返回的矩陣是 U 和 S 的點積。如果我們想模擬TSVD，我們就去掉最小的奇異值和對應的 U 的列向量。例如，我們想要一個主成份，可以這樣：

In[4]:
new_S = S[0]
new_U = U[:, 0]
new_U.dot(new_S)
Out[4]:
array([-2.20719466, -3.16170819, -4.11622173])

#### 實現：

from sklearn import decomposition

# BUG FIX: the timer was started *after* fit_transform, so the reported
# time excluded the decomposition entirely (always ~0s).  Start the clock
# first, consistent with the Random Projection and t-SNE sections.
start_time = time.time()
X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
plot_embedding(X_pca,
               "Principal Components projection of the digits (time: %.3fs)"
               % (time.time() - start_time))
plt.show()

### 3.t-SNE(t-分佈鄰域嵌入演算法)

#### 流形學習方法(Manifold Learning)，簡稱流形學習,可以將流形學習方法分為線性的和非線性的兩種，線性的流形學習方法如我們熟知的主成份分析（PCA），非線性的流形學習方法如等距對映（Isomap）、拉普拉斯特徵對映（Laplacian eigenmaps，LE）、區域性線性嵌入(Locally-linear embedding，LLE)。

t-SNE詳細介紹：http://lvdmaaten.github.io/tsne/

from sklearn import manifold

# Non-linear dimensionality reduction with t-SNE, initialised from PCA.
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
start_time = time.time()
X_tsne = tsne.fit_transform(X)
# Visualise the embedding with the elapsed fit time in the title.
plot_embedding(
    X_tsne,
    "t-SNE embedding of the digits (time: %.3fs)" % (time.time() - start_time))
plt.show()
# Even with just 2 features, this non-linear reduction separates the
# classes well in the plane; the trade-off is that t-SNE needs far more
# computation time than the linear methods above.
#這個非線性變換降維過後，僅僅2維的特徵，就可以將原始資料的不同類別，在平面上很好地劃分開。
#不過t-SNE也有它的缺點，一般說來，相對於線性變換的降維，它需要更多的計算時間。