readers.py
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
def read_file(filename, sep="\t"):
    # Read the ratings file into a DataFrame with named columns
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filename, sep=sep, header=None, names=col_names, engine='python')
    # Shift user and item IDs to be zero-based
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df
class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]
class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
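As a quick sanity check, the two iterators can be exercised on toy data. The following is only a minimal sketch (not part of readers.py); the toy lists are made up for illustration:

import numpy as np
import readers

# Toy data: three parallel columns, in the same layout the training code uses
users = [0, 1, 2, 3, 4]
items = [10, 11, 12, 13, 14]
rates = [3.0, 4.0, 5.0, 2.0, 1.0]

# Random batches of size 2, drawn with replacement
train_iter = readers.ShuffleIterator([users, items, rates], batch_size=2)
u, i, r = next(train_iter)          # each is a numpy array of length 2

# One full pass over the data in a single batch (batch_size=-1)
test_iter = readers.OneEpochIterator([users, items, rates], batch_size=-1)
for u, i, r in test_iter:
    print(u, i, r)                  # prints the full pass once, then stops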


Train_RecSys
Building a latent factor model recommendation system with TensorFlow
3,900 movies, 6,040 users
Dataset description: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
Data download: http://files.grouplens.org/datasets/movielens/ml-1m.zip
TensorFlow (unofficial Windows wheels): http://www.lfd.uci.edu/~gohlke/pythonlibs/#tensorflow
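The model built below is a biased matrix factorization: each rating is predicted from a global bias, a user bias, an item bias, and the inner product of the user and item embeddings. Matching the code below (\mu is bias_global, b_u and b_i the bias embeddings, p_u and q_i the dim-dimensional embeddings, \lambda the reg parameter), the prediction and the per-batch objective are

\hat{r}_{ui} = \mu + b_u + b_i + p_u^\top q_i

\mathcal{L} = \sum_{(u,i) \in \text{batch}} \tfrac{1}{2}\left(\hat{r}_{ui} - r_{ui}\right)^2 + \tfrac{\lambda}{2}\left(\lVert p_u \rVert^2 + \lVert q_i \rVert^2\right)

(the factors of 1/2 come from tf.nn.l2_loss, which computes half the sum of squares).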
# Imports for data io operations
from collections import deque
from six import next
import readers
# Main imports for training
import tensorflow as tf
import numpy as np
# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)
u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset
batch_size = 1000 # Number of samples per batch
dims = 5 # Number of latent factors (the Test_RecSys run below uses 15)
max_epochs = 50 # Number of times the network sees all the training data
# Device used for all computations
place_device = "/cpu:0"
def get_data():
    # Reads the ratings file using the delimiter ::
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("./ml-1m/ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
def clip(x):
    return np.clip(x, 1.0, 5.0)
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # AUTO_REUSE creates the variables on the first call and reuses them afterwards;
        # reuse=True would fail on a fresh graph because the variables do not exist yet
        with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):
            # Using a global bias term
            bias_global = tf.get_variable("bias_global", shape=[])
            # User and item bias variables
            # get_variable: Prefixes the name with the current variable scope
            # and performs reuse checks.
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # embedding_lookup: Looks up 'ids' in a list of embedding tensors
            # Bias embeddings for user and items, given a batch
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # User and item weight variables
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Weight embeddings for user and items, given a batch
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    return infer, regularizer
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # Plain gradient descent optimizer (the Test_RecSys version below uses FTRL instead)
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
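Since tf.nn.l2_loss includes the factor of 1/2, the cost above can be checked by hand. The following numpy sketch uses made-up toy values purely for illustration:

# Numpy equivalent of the cost above for a toy batch (illustrative values only)
pred  = np.array([3.5, 4.0], dtype=np.float32)              # infer
rates = np.array([4.0, 3.0], dtype=np.float32)              # rate_batch
p = np.array([[0.1, 0.2], [0.3, 0.1]], dtype=np.float32)    # embd_user
q = np.array([[0.2, 0.1], [0.1, 0.4]], dtype=np.float32)    # embd_item
reg = 0.1
cost_l2 = 0.5 * np.sum((pred - rates) ** 2)                  # tf.nn.l2_loss(infer - rate_batch)
regularizer = 0.5 * np.sum(p ** 2) + 0.5 * np.sum(q ** 2)    # sum of the two l2_loss terms
cost = cost_l2 + reg * regularizer
print(cost)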
# Read data from ratings file to build a TF model
df_train, df_test = get_data()
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" %
(len(df_train), len(df_test), samples_per_batch))
Output: Number of train samples 900188, test samples 100021, samples per batch 900
# Peeking at the top 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
Output:
0 1463
1 1260
2 1205
3 4657
4 5604
Name: user, dtype: int32
0 4138
1 5233
2 1419
3 2019
4 2786
Name: user, dtype: int32
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
Output:
0 2114
1 1209
2 328
3 615
4 2500
Name: item, dtype: int32
0 1403
1 3741
2 1731
3 2114
4 3104
Name: item, dtype: int32
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())
Output:
0 2.0
1 5.0
2 4.0
3 2.0
4 4.0
Name: rate, dtype: float32
0 3.0
1 5.0
2 5.0
3 3.0
4 3.0
Name: rate, dtype: float32
# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)
# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/')
Output:
Epoch Train Error Val Error Elapsed Time
00 2.817 1.114 0.034 secs
01 1.047 1.003 1.028 secs
02 0.982 0.968 1.038 secs
03 0.955 0.950 1.011 secs
04 0.941 0.940 1.005 secs
05 0.932 0.934 1.092 secs
06 0.927 0.929 1.317 secs
07 0.923 0.926 1.175 secs
08 0.918 0.923 1.117 secs
09 0.916 0.921 1.013 secs
10 0.914 0.919 1.019 secs
11 0.911 0.918 1.032 secs
12 0.910 0.917 1.037 secs
13 0.909 0.917 1.235 secs
14 0.908 0.915 1.156 secs
15 0.907 0.914 1.420 secs
16 0.907 0.914 1.324 secs
17 0.905 0.914 1.134 secs
18 0.904 0.914 1.020 secs
19 0.904 0.913 1.022 secs
20 0.904 0.913 1.065 secs
21 0.903 0.912 1.005 secs
22 0.902 0.912 1.006 secs
23 0.903 0.911 1.020 secs
24 0.902 0.911 1.036 secs
25 0.901 0.911 1.071 secs
26 0.902 0.912 1.014 secs
27 0.900 0.911 0.994 secs
28 0.901 0.911 1.014 secs
29 0.902 0.910 1.007 secs
30 0.902 0.911 1.046 secs
31 0.901 0.910 0.996 secs
32 0.899 0.910 0.996 secs
33 0.900 0.910 1.010 secs
34 0.899 0.911 1.010 secs
35 0.900 0.910 1.037 secs
36 0.899 0.910 0.999 secs
37 0.900 0.911 0.990 secs
38 0.900 0.910 1.010 secs
39 0.900 0.910 1.009 secs
40 0.899 0.910 1.040 secs
41 0.900 0.911 0.994 secs
42 0.900 0.910 0.996 secs
43 0.898 0.911 1.013 secs
44 0.899 0.910 1.013 secs
45 0.899 0.910 1.036 secs
46 0.899 0.910 0.999 secs
47 0.897 0.909 0.993 secs
48 0.899 0.910 1.012 secs
49 0.900 0.910 1.007 secs

Test_RecSys
Recommendation Systems using TensorFlow
Builds a recommendation engine using MovieLens data
We use the 1M data set for building our recommendation engine. The 1M data set contains 1,000,209 anonymous ratings:
- Ratings for approximately 3,900 movies
- Ratings provided by 6,040 MovieLens users who joined MovieLens in 2000
More information about the data can be viewed at: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
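If the data is not already on disk, a minimal sketch for fetching and extracting it (assuming the download URL given above is still live) could look like this; the archive unpacks ratings.dat into an ml-1m/ subdirectory:

import os
import zipfile
from six.moves.urllib.request import urlretrieve

# Download and unpack the MovieLens 1M archive next to the scripts
DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
if not os.path.exists("ml-1m/ratings.dat"):
    urlretrieve(DATA_URL, "ml-1m.zip")
    with zipfile.ZipFile("ml-1m.zip") as zf:
        zf.extractall(".")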
# Imports for data io operations
from collections import deque
from six import next
import readers
# Main imports for training
import tensorflow as tf
import numpy as np
# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)
u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset
batch_size = 1000 # Number of samples per batch
dims = 15 # Number of latent factors
max_epochs = 25 # Number of times the network sees all the training data
# Device used for all computations
place_device = "/cpu:0"
def get_data():
    # Reads the ratings file using the delimiter ::
    # Download MovieLens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
def clip(x):
    return np.clip(x, 1.0, 5.0)
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # Using a global bias term
        bias_global = tf.get_variable("bias_global", shape=[])
        # User and item bias variables
        # get_variable: Prefixes the name with the current variable scope
        # and performs reuse checks.
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        # embedding_lookup: Looks up 'ids' in a list of embedding tensors
        # Bias embeddings for user and items, given a batch
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        # User and item weight variables
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        # Weight embeddings for user and items, given a batch
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        # tf.mul was removed in TF 1.0; tf.multiply is the current name
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    return infer, regularizer
def loss(infer, regularizer, rate_batch, learning_rate=0.1, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        # tf.sub/tf.mul were removed in TF 1.0; use tf.subtract/tf.multiply
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # 'Follow the Regularized Leader' optimizer
        # Reference: http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
        train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)
    return cost, train_op
# Read data from ratings file to build a TF model
df_train, df_test = get_data()
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" %
(len(df_train), len(df_test), samples_per_batch))
Output: Number of train samples 900188, test samples 100021, samples per batch 900
# Peeking at the top 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
Output:
0 5411
1 5439
2 367
3 424
4 4941
Name: user, dtype: int32
0 1696
1 5448
2 2242
3 5629
4 423
Name: user, dtype: int32
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
Output:
0 2682
1 903
2 3716
3 1720
4 3696
Name: item, dtype: int32
0 3113
1 1195
2 749
3 3623
4 2899
Name: item, dtype: int32
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())
Output:
0 2.0
1 5.0
2 4.0
3 4.0
4 1.0
Name: rate, dtype: float32
0 5.0
1 5.0
2 5.0
3 2.0
4 2.0
Name: rate, dtype: float32
# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)
# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/model')

# Inference using saved model
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    #sess.run(init_op)
    new_saver = tf.train.import_meta_graph('./save/model.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./save/'))
    test_err2 = np.array([])
    for users, items, rates in iter_test:
        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                item_batch: items})
        pred_batch = clip(pred_batch)
        print("Pred\tActual")
        for ii in range(10):
            print("%.3f\t%.3f" % (pred_batch[ii], rates[ii]))
        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
    print(np.sqrt(np.mean(test_err2)))
Output:
Pred Actual
4.982 5.000
4.486 5.000
5.000 5.000
4.031 2.000
2.908 2.000
4.500 5.000
2.892 3.000
3.943 4.000
1.485 2.000
4.104 1.000
0.850395450933
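The restored graph can also be used to rank items for a single user rather than score known pairs. This is only an illustrative sketch to be run inside the with tf.Session() block above, after the restore; the user ID 0 and the top-10 cut-off are arbitrary, and infer, user_batch, item_batch, i_num and clip are the objects defined earlier:

    # Score every movie for one user and keep the 10 highest predictions
    uid = 0                                        # arbitrary example user
    all_items = np.arange(i_num, dtype=np.int32)   # every item ID in the dataset
    scores = sess.run(infer, feed_dict={user_batch: np.full(i_num, uid, dtype=np.int32),
                                        item_batch: all_items})
    scores = clip(scores)
    top10 = all_items[np.argsort(scores)[::-1][:10]]
    print("Top-10 item IDs for user %d:" % uid, top10)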