基于 MF 的推荐系统:FunkSVD 算法以及实现

花了点时间重写了基于内存的推荐算法:通过矩阵分解来拟合原始评分矩阵。下一篇将会给出 BiasSVD 算法。整个算法的难点在于梯度的计算以及梯度的更新;对于批量梯度下降(Batch Gradient Descent)算法,可以参照相应的数学公式实现。

基于 MF 的推荐系统:FunkSVD 算法以及实现

基于 MF 的推荐系统:FunkSVD 算法以及实现



import os
import socket
import time
from collections import Counter

from numpy import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import progressbar
import ray
from sklearn import preprocessing
from sklearn.utils import shuffle
# Silence NumPy floating-point warnings for division by zero and invalid operations.
np.seterr(divide='ignore', invalid='ignore')
# This module implements an SVD-based recommendation model using the Ray framework.
# (The original note calls it the SVD++ algorithm; the functions below implement FunkSVD.)
# name: kenny adelaide
# email: kenny13141314@163.com
# time: 2021/11/17
# ray.init(address='', _redis_password="5241590000000000")
# =================================================common function area==============================================================
def onloaddata(path=None):
    """Load the rating table from a CSV file into memory.

    Args:
        path: Optional path of the CSV file to read. When omitted, the file
            ``data/00000005.csv`` located next to this module is used. The
            file must provide ``userid`` and ``videoid`` columns.

    Returns:
        A list ``[o_data, userno, videono]`` where ``o_data`` is the loaded
        DataFrame, ``userno`` is the largest ``userid`` plus one and
        ``videono`` is the largest ``videoid`` plus one.
    """
    if path is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(module_dir, 'data/00000005.csv')
    o_data = pd.read_csv(path)
    # Ids are used directly as 0-based matrix indices elsewhere in this file,
    # so each dimension has to be (max id + 1).
    userno = o_data['userid'].max() + 1
    videono = o_data['videoid'].max() + 1
    return [o_data, userno, videono]
def build_score_matrix_R(data, userno, videono):
    """Build the observed score matrix from the raw rating table.

    This is a common helper for all algorithm models in this file.

    Args:
        data: DataFrame with ``userid``, ``videoid`` and ``score`` columns.
        userno: Number of rows to allocate (largest user code plus one).
        videono: Number of columns to allocate (largest item code plus one).

    Returns:
        An object-dtype array of shape ``(userno, videono)``. Cells that have
        a rating hold it as a ``float``; every other cell is ``None``. If the
        same (user, item) pair occurs more than once, the last row wins.
    """
    # np.full allocates independent cells and keeps the result 2-D even when
    # one of the dimensions is 0 (a nested-list build collapses to 1-D then).
    matrix = np.full((userno, videono), None, dtype=object)
    # Walking the three columns in lockstep avoids the per-row Series that
    # DataFrame.iterrows() would create.
    for user, item, score in zip(data['userid'], data['videoid'], data['score']):
        matrix[int(user), int(item)] = float(score)
    return matrix
def L2Norm(a, vector):
    """Return the first entry of ``np.dot(vector, vector)`` scaled by ``a``.

    Args:
        a: Scale factor (presumably the regularisation coefficient - confirm
            with callers).
        vector: Array-like for which ``np.dot(vector, vector)`` yields a
            result that is at least 2-D, e.g. a square 2-D array.

    Returns:
        Element ``[0][0]`` of ``np.dot(vector, vector) * a``.

    NOTE(review): for a plain 1-D vector ``np.dot`` returns a scalar and the
    ``list(...)`` call below raises ``TypeError`` - confirm the intended input.
    """
    scaled_product = np.dot(vector, vector) * a
    leading_rows = list(scaled_product)
    return leading_rows[0][0]
# =================================================FunkSVD==============================================================
def init_P_Q_matrix(user_disms=(3, 3), item_disms=(3, 3), init_method='quadrature'):
    """Create the two randomly initialised factor matrices used for training.

    Args:
        user_disms: ``(number_of_users, number_of_factors)`` - the shape of
            the user matrix ``P``.
        item_disms: ``(number_of_items, number_of_factors)``. The item matrix
            ``Q`` is created transposed, i.e. with shape
            ``(number_of_factors, number_of_items)``.
        init_method: Initialisation scheme. Only ``'quadrature'`` is
            supported; it draws every entry with ``np.random.randn``.

    Returns:
        A list ``[P, Q]`` of the two arrays.

    Raises:
        ValueError: If ``init_method`` is not a supported scheme (previously
            the function silently returned ``None``).
    """
    if str(init_method) == 'quadrature':
        P = np.random.randn(user_disms[0], user_disms[1])
        Q = np.random.randn(item_disms[1], item_disms[0])
        return [P, Q]
    raise ValueError('unsupported init_method: %r' % (init_method,))
def calculate_error(P_matrix, Q_matrix, y_matrix):
    """Compute the prediction error for every observed rating.

    Args:
        P_matrix: User factor array, indexed by user along its first axis.
        Q_matrix: Item factor array, indexed by item along its second axis
            (its transpose is indexed by item).
        y_matrix: Rating array in which unobserved cells are ``None``.

    Returns:
        A 1-D float array with ``rating - prediction`` for each observed
        cell, in the order produced by ``np.nonzero(y_matrix != None)``.
    """
    # Element-wise comparison is intended here: `is not None` would test the
    # array object itself instead of its cells.
    rows, cols = np.nonzero(y_matrix != None)  # noqa: E711
    predictions = np.sum(P_matrix[rows] * Q_matrix.T[cols], axis=1)
    # The rating matrix is object-dtype (it stores None); convert the observed
    # ratings so the result is a numeric array rather than an object array.
    observed = y_matrix[rows, cols].astype(float)
    errors = observed - predictions
    return errors
def gradient(P_matrix, Q_matrix, rows, cols, a, index, error):
    """Compute the regularised squared-error gradients for one observed rating.

    Args:
        P_matrix: User factor array (one row per user).
        Q_matrix: Item factor array (one column per item).
        rows: User indices of the observed ratings.
        cols: Item indices of the observed ratings.
        a: Regularisation coefficient.
        index: Position in ``rows``/``cols`` of the rating being processed.
        error: Prediction error (rating minus prediction) for that rating.

    Returns:
        A list ``[Q_gradient, P_gradient]`` - note the item gradient is first.
    """
    user_idx = rows[index]
    item_idx = cols[index]
    user_factors = P_matrix[user_idx, :]
    item_factors = Q_matrix[:, item_idx]
    error_scale = -2 * error
    penalty_scale = 2 * a
    P_gradient = error_scale * item_factors + penalty_scale * user_factors
    Q_gradient = error_scale * user_factors + penalty_scale * item_factors
    return [Q_gradient, P_gradient]
def updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols):
    """Apply one gradient-descent step to a single user row and item column.

    Args:
        Q_gradient: Gradient for the selected item column of ``Q``.
        P_gradient: Gradient for the selected user row of ``P``.
        P: User factor array; modified in place.
        Q: Item factor array; modified in place.
        learning_rate: Step size applied to both gradients.
        index: Position in ``rows``/``cols`` of the rating being processed.
        rows: User indices of the observed ratings.
        cols: Item indices of the observed ratings.

    Returns:
        A list ``[P, Q]`` holding the same (now updated) objects passed in.
    """
    user_idx = rows[index]
    item_idx = cols[index]
    user_step = learning_rate * P_gradient
    item_step = learning_rate * Q_gradient
    # Augmented assignment writes into the caller's arrays.
    P[user_idx, :] -= user_step
    Q[:, item_idx] -= item_step
    return [P, Q]
def funck_svd(learning_rate=0.001, iters=50000, a=0.005, factors=2, tolerance=0.00001):
    """Fit two factor matrices to the original rating matrix (FunkSVD).

    Loads the ratings, initialises ``P`` and ``Q`` randomly, and repeatedly
    applies gradient steps for every observed rating until the squared-error
    cost drops to ``tolerance`` or ``iters`` passes have run.

    Args:
        learning_rate: Step size of each gradient update.
        iters: Maximum number of passes over the observed ratings.
        a: Regularisation coefficient passed to ``gradient``.
        factors: Number of latent factors per user and per item.
        tolerance: The loop stops once the summed squared error is at or
            below this value.

    Returns:
        A tuple ``(cost_arr, count)``: the cost measured at the start of each
        pass, and the number of passes whose updates were applied.
    """
    [data, userno, videono] = onloaddata()
    [P, Q] = init_P_Q_matrix(user_disms=[userno, factors],
                             item_disms=[videono, factors],
                             init_method='quadrature')
    y_matirx = build_score_matrix_R(data, userno, videono)
    # Element-wise comparison against None marks the observed cells; this is
    # the same ordering calculate_error uses for its result.
    rows, cols = np.nonzero(y_matirx != None)  # noqa: E711
    cost_arr = []
    count = 0
    for _ in progressbar.progressbar(range(iters)):
        # Errors are computed once per pass; every update in the pass uses
        # these values even though P and Q change along the way.
        errors = np.asarray(calculate_error(P, Q, y_matirx), dtype=float)
        cost = float(np.sum(np.square(errors)))
        cost_arr.append(cost)
        if cost <= tolerance:
            break
        for index, error in enumerate(errors):
            [Q_gradient, P_gradient] = gradient(P, Q, rows, cols, a, index, error)
            [P, Q] = updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols)
        count += 1
    return cost_arr, count

版权声明:本文为博主kennyadelaide原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。




上一篇 2021年11月22日 下午9:50
下一篇 2021年11月22日 下午10:19

