# Load the GloVe vectors: each non-empty line is "<token> <v1> <v2> ...".
with open(filename, encoding="utf8") as f:
    # Iterate the file lazily instead of materializing every line at once.
    for line in f:
        fields = line.split()
        if not fields:
            # Blank (e.g. trailing) lines carry no token; skip them.
            continue
        glove[fields[0]] = [float(value) for value in fields[1:]]
# NOTE: supply your own word list here (e.g. look one up online); the author
# used a CET-4 high-frequency vocabulary of 700 words.
words = [...]

# Keep only the words that actually have a GloVe vector.
selected_vecs = {word: glove[word] for word in words if word in glove}
if not selected_vecs:
    # Fail early with a clear message instead of crashing later on empty data
    # (this is what happens when the `words` placeholder was never filled in).
    raise ValueError(
        "none of the entries in `words` has a GloVe vector; "
        "fill in `words` with real vocabulary"
    )
selected_words = list(selected_vecs)
# Per-word variance of the vector components, used as the kernel bandwidth.
sigma = [np.var(vec) for vec in selected_vecs.values()]
n = len(selected_vecs)
# Build the high-dimensional affinity matrix.
# _p[i][j] is the conditional affinity of word j given word i (rows sum to 1).
_p = np.zeros((n, n))
for i in tqdm(range(n)):
    # NOTE(review): `sigma` is filled with np.var(...) values, so this term is
    # the variance squared -- confirm that this is the intended bandwidth.
    two_sigma_sq = 2 * sigma[i] ** 2
    # Unnormalized Gaussian affinities of word i to every other word; each
    # distance is evaluated once and reused for the row normalization.
    for j in range(n):
        if i == j:
            continue  # the diagonal (self-affinity) stays 0
        # presumably a squared distance between the two GloVe vectors --
        # verify against vec_distance's definition
        d = vec_distance(selected_words[i], selected_words[j])
        _p[i][j] = np.exp(-d / two_sigma_sq)
    # Row normalizer (the diagonal contributes 0 to the sum).
    denominator = _p[i].sum()
    if denominator > 0:
        _p[i] /= denominator

# Symmetrize into joint probabilities: p_ij = (p_j|i + p_i|j) / (2n).
p = (_p + _p.T) / (2 * n)
# Settings for fitting the low-dimensional embedding.
EPOCH = 100  # number of gradient-descent iterations
DIM = 2  # dimensionality of the embedding
lr = 1e3  # initial learning rate (adapted during training)
window = 5  # how many recent KL values are kept for the moving average
pre_kl = []  # history of the most recent KL values

# Initial embedding: every coordinate is drawn from N(0, 1).
y = np.random.randn(n, DIM)
def KL(P, Q, epsilon=1e-10):
    """Return the KL divergence of P from Q, measured in bits (log base 2).

    Args:
        P: Array-like of probabilities (the reference distribution).
        Q: Array-like of probabilities, broadcastable against ``P``.
        epsilon: Small constant added to both ``P`` and ``Q`` so that zero
            entries do not produce ``log(0)`` or a division by zero.

    Returns:
        The sum over all entries of ``(P + eps) * log2((P + eps) / (Q + eps))``.
    """
    smoothed_p = np.asarray(P) + epsilon
    smoothed_q = np.asarray(Q) + epsilon
    return (smoothed_p * np.log2(smoothed_p / smoothed_q)).sum()
# Gradient descent on the embedding `y`, minimizing KL(p || q).
for epoch in range(EPOCH):
    # Pairwise differences between embedded points: shape (n, n, DIM).
    dy = y.reshape(n, 1, DIM) - y.reshape(1, n, DIM)
    # Student-t kernel 1 / (1 + ||y_i - y_j||^2): shape (n, n).
    inv_dist = 1 / ((dy ** 2).sum(axis=2) + 1)
    # Self-pairs are not part of the distribution: zero the diagonal so the
    # normalizer only sums over i != j and q_ii = 0.
    np.fill_diagonal(inv_dist, 0.0)
    # Low-dimensional joint probabilities.
    q = inv_dist / inv_dist.sum()

    # Loss for monitoring.
    kl = KL(p, q)
    print(f"epoch {epoch + 1}/{EPOCH}\tKL = {kl:.6f}\tLR = {lr}")

    # Adapt the learning rate: speed up while the loss is below its recent
    # average, otherwise halve it.
    if pre_kl:
        kl_mean = sum(pre_kl) / len(pre_kl)
        if kl_mean > kl:
            lr *= 1.2
        else:
            lr *= 0.5
    pre_kl.append(kl)
    if len(pre_kl) > window:
        pre_kl.pop(0)

    # Gradient: grad_i = 4 * sum_j (p_ij - q_ij) * (y_i - y_j) * inv_dist_ij,
    # evaluated for all i at once instead of a Python double loop.
    d_pq = p - q
    grad = 4 * ((d_pq * inv_dist)[:, :, np.newaxis] * dy).sum(axis=1)

    # Update the embedding.
    y = y - grad * lr
    # Re-centre the embedding; q depends only on differences, so this does
    # not affect the optimization.
    y -= y.mean(axis=0)
# Visualize the embedded word vectors.
SHOW = 150  # plot only a subset of the words so the figure is not too dense
# Never index past the words that were actually embedded.
n_show = min(SHOW, len(selected_words))

if DIM == 3:
    # Importing Axes3D registers the '3d' projection on older Matplotlib.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments; create the 3-D axes
    # explicitly instead.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(y[:n_show, 0], y[:n_show, 1], y[:n_show, 2], s=1)
    for i, word in enumerate(selected_words[:n_show]):
        ax.text(y[i, 0], y[i, 1], y[i, 2], word, fontsize=10)
else:
    plt.scatter(y[:n_show, 0], y[:n_show, 1], s=1)
    for i, word in enumerate(selected_words[:n_show]):
        plt.text(y[i, 0], y[i, 1], word, fontsize=10)