
Recommendation System (RecSys)

Author: ForgetThatNight | Published 2018-07-05 22:35

readers.py

from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd


def read_file(filename, sep="\t"):
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filename, sep=sep, header=None, names=col_names, engine='python')
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df


class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]


class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
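
A quick sanity check of the two iterators on tiny synthetic arrays (a hypothetical snippet, not part of the original notebook):

# Hypothetical quick check of the iterators on synthetic data
import numpy as np
from readers import ShuffleIterator, OneEpochIterator

users = np.arange(10)                  # 10 fake user ids
items = np.arange(10, 20)              # 10 fake item ids
rates = np.linspace(1.0, 5.0, 10).astype(np.float32)

train_it = ShuffleIterator([users, items, rates], batch_size=4)
u, i, r = next(train_it)               # three arrays of length 4, rows drawn at random

test_it = OneEpochIterator([users, items, rates], batch_size=4)
n_batches = sum(1 for _ in test_it)    # 3 batches (sizes 4, 3, 3), then StopIteration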

Train_RecSys

Building a recommendation system with a latent factor model in TensorFlow.

Approximately 3,900 movies and 6,040 users.

Dataset description: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt

Dataset download: http://files.grouplens.org/datasets/movielens/ml-1m.zip

TensorFlow wheels for Windows (if needed): http://www.lfd.uci.edu/~gohlke/pythonlibs/#tensorflow
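
The model built below is the classic biased matrix factorization: a rating is predicted as a global bias plus a user bias, an item bias, and the dot product of a user latent vector with an item latent vector. A small NumPy sketch of that prediction rule (illustrative only, with made-up values):

# Illustrative NumPy version of the prediction rule implemented by model() below
import numpy as np

dim = 5
p_u = np.random.randn(dim) * 0.02      # latent vector for one user
q_i = np.random.randn(dim) * 0.02      # latent vector for one item
b, b_u, b_i = 3.5, 0.1, -0.2           # global, user and item biases (made-up values)

r_hat = b + b_u + b_i + p_u.dot(q_i)   # predicted rating, later clipped to [1, 5]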

# Imports for data io operations
from collections import deque
from six import next
import readers

# Main imports for training
import tensorflow as tf
import numpy as np

# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)

u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset

batch_size = 1000 # Number of samples per batch
dims = 5          # Number of latent factor dimensions
max_epochs = 50   # Number of times the network sees all the training data

# Device used for all computations
place_device = "/cpu:0"


def get_data():
    # Reads the ratings file using the delimiter ::
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("./ml-1m/ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    
    return df_train, df_test

def clip(x):
    return np.clip(x, 1.0, 5.0)

def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        with tf.variable_scope('lsi'):
            # Using a global bias term
            bias_global = tf.get_variable("bias_global", shape=[])
            # User and item bias variables
            # get_variable: Prefixes the name with the current variable scope 
            # and performs reuse checks.
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # embedding_lookup: Looks up 'ids' in a list of embedding tensors
            # Bias embeddings for user and items, given a batch
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # User and item weight variables
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Weight embeddings for user and items, given a batch
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                             name="svd_regularizer")
    return infer, regularizer

def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # Plain gradient descent optimizer
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
# Read data from ratings file to build a TF model
df_train, df_test = get_data()

samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" % 
      (len(df_train), len(df_test), samples_per_batch))

Output: Number of train samples 900188, test samples 100021, samples per batch 900

# Peeking at the top 5 user values
print(df_train["user"].head()) 
print(df_test["user"].head())

Output:
0 1463
1 1260
2 1205
3 4657
4 5604
Name: user, dtype: int32
0 4138
1 5233
2 1419
3 2019
4 2786
Name: user, dtype: int32

# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())

Output:
0 2114
1 1209
2 328
3 615
4 2500
Name: item, dtype: int32
0 1403
1 3741
2 1731
3 2114
4 3104
Name: item, dtype: int32

# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())

Output:
0 2.0
1 5.0
2 4.0
3 2.0
4 4.0
Name: rate, dtype: float32
0 3.0
1 5.0
2 5.0
3 3.0
4 3.0
Name: rate, dtype: float32

# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                     df_train["rate"]],
                                     batch_size=batch_size)

# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                     df_test["rate"]],
                                     batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end

    saver.save(sess, './save/')

Output:
Epoch Train Error Val Error Elapsed Time
00 2.817 1.114 0.034 secs
01 1.047 1.003 1.028 secs
02 0.982 0.968 1.038 secs
03 0.955 0.950 1.011 secs
04 0.941 0.940 1.005 secs
05 0.932 0.934 1.092 secs
06 0.927 0.929 1.317 secs
07 0.923 0.926 1.175 secs
08 0.918 0.923 1.117 secs
09 0.916 0.921 1.013 secs
10 0.914 0.919 1.019 secs
11 0.911 0.918 1.032 secs
12 0.910 0.917 1.037 secs
13 0.909 0.917 1.235 secs
14 0.908 0.915 1.156 secs
15 0.907 0.914 1.420 secs
16 0.907 0.914 1.324 secs
17 0.905 0.914 1.134 secs
18 0.904 0.914 1.020 secs
19 0.904 0.913 1.022 secs
20 0.904 0.913 1.065 secs
21 0.903 0.912 1.005 secs
22 0.902 0.912 1.006 secs
23 0.903 0.911 1.020 secs
24 0.902 0.911 1.036 secs
25 0.901 0.911 1.071 secs
26 0.902 0.912 1.014 secs
27 0.900 0.911 0.994 secs
28 0.901 0.911 1.014 secs
29 0.902 0.910 1.007 secs
30 0.902 0.911 1.046 secs
31 0.901 0.910 0.996 secs
32 0.899 0.910 0.996 secs
33 0.900 0.910 1.010 secs
34 0.899 0.911 1.010 secs
35 0.900 0.910 1.037 secs
36 0.899 0.910 0.999 secs
37 0.900 0.911 0.990 secs
38 0.900 0.910 1.010 secs
39 0.900 0.910 1.009 secs
40 0.899 0.910 1.040 secs
41 0.900 0.911 0.994 secs
42 0.900 0.910 0.996 secs
43 0.898 0.911 1.013 secs
44 0.899 0.910 1.013 secs
45 0.899 0.910 1.036 secs
46 0.899 0.910 0.999 secs
47 0.897 0.909 0.993 secs
48 0.899 0.910 1.012 secs
49 0.900 0.910 1.007 secs


Test_RecSys

Recommendation Systems using TensorFlow

Builds a recommendation engine using MovieLens data

We use the 1M data set to build our recommendation engine. The 1M data set contains 1,000,209 anonymous ratings:

  • Ratings for approximately 3,900 movies
  • Ratings provided by 6,040 MovieLens users who joined MovieLens in 2000

More information about the data can be viewed at: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
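
Each line of ratings.dat has the form UserID::MovieID::Rating::Timestamp; readers.read_file parses the file and shifts the IDs to be 0-based. A hypothetical illustration of that mapping for the sample record quoted in the comments below:

# Hypothetical illustration of how one ratings.dat record is interpreted
line = "3::1196::4::978297539"               # UserID::MovieID::Rating::Timestamp
user, item, rate, st = line.split("::")
user, item = int(user) - 1, int(item) - 1    # read_file shifts IDs to 0-based
rate = float(rate)
print(user, item, rate)                      # -> 2 1195 4.0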

# Imports for data io operations
from collections import deque
from six import next
import readers

# Main imports for training
import tensorflow as tf
import numpy as np

# Evaluate train times per epoch
import time

# Constant seed for replicating training results
np.random.seed(42)

u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset

batch_size = 1000 # Number of samples per batch
dims = 15         # Number of latent factor dimensions
max_epochs = 25   # Number of times the network sees all the training data

# Device used for all computations
place_device = "/cpu:0"
def get_data():
    # Reads the ratings file using the delimiter ::
    # Download movie lens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    
    return df_train, df_test

def clip(x):
    return np.clip(x, 1.0, 5.0)


def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # Using a global bias term
        bias_global = tf.get_variable("bias_global", shape=[])
        # User and item bias variables
        # get_variable: Prefixes the name with the current variable scope 
        # and performs reuse checks.
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        # embedding_lookup: Looks up 'ids' in a list of embedding tensors
        # Bias embeddings for user and items, given a batch
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        # User and item weight variables
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        # Weight embeddings for user and items, given a batch
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                             name="svd_regularizer")
    return infer, regularizer


def loss(infer, regularizer, rate_batch, learning_rate=0.1, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # 'Follow the Regularized Leader' optimizer
        # Reference: http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
        train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)
    return cost, train_op
# Read data from ratings file to build a TF model
df_train, df_test = get_data()

samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" % 
      (len(df_train), len(df_test), samples_per_batch))

Output: Number of train samples 900188, test samples 100021, samples per batch 900

# Peeking at the top 5 user values
print(df_train["user"].head()) 
print(df_test["user"].head())

Output:
0 5411
1 5439
2 367
3 424
4 4941
Name: user, dtype: int32
0 1696
1 5448
2 2242
3 5629
4 423
Name: user, dtype: int32

# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())

Output:
0 2682
1 903
2 3716
3 1720
4 3696
Name: item, dtype: int32
0 3113
1 1195
2 749
3 3623
4 2899
Name: item, dtype: int32

# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())

Output:
0 2.0
1 5.0
2 4.0
3 4.0
4 1.0
Name: rate, dtype: float32
0 5.0
1 5.0
2 5.0
3 2.0
4 2.0
Name: rate, dtype: float32

# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                     df_train["rate"]],
                                     batch_size=batch_size)

# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                     df_test["rate"]],
                                     batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end

    saver.save(sess, './save/model')
# Inference using saved model
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    #sess.run(init_op)
    new_saver = tf.train.import_meta_graph('./save/model.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./save/'))
    test_err2 = np.array([])
    for users, items, rates in iter_test:
        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                item_batch: items})
        pred_batch = clip(pred_batch)
        print("Pred\tActual")
        for ii in range(10):
            print("%.3f\t%.3f" % (pred_batch[ii], rates[ii]))
        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
        print(np.sqrt(np.mean(test_err2)))

Output:
Pred Actual
4.982 5.000
4.486 5.000
5.000 5.000
4.031 2.000
2.908 2.000
4.500 5.000
2.892 3.000
3.943 4.000
1.485 2.000
4.104 1.000
0.850395450933

Original article: https://www.haomeiwen.com/subject/smemuftx.html