readers.py
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
def read_file(filename, sep="\t"):
    # Read the ratings file into a DataFrame with named columns
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filename, sep=sep, header=None, names=col_names, engine='python')
    # Shift user and item IDs to be zero-based
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df
class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]
class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
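As a quick sanity check, the two iterators can be exercised on toy data. The following is only a minimal sketch (not part of readers.py); the toy lists are made up for illustration:

import numpy as np
import readers

# Toy data: three parallel columns, in the same layout the training code uses
users = [0, 1, 2, 3, 4]
items = [10, 11, 12, 13, 14]
rates = [3.0, 4.0, 5.0, 2.0, 1.0]

# Random batches of size 2, drawn with replacement
train_iter = readers.ShuffleIterator([users, items, rates], batch_size=2)
u, i, r = next(train_iter)          # each is a numpy array of length 2

# One full pass over the data in a single batch (batch_size=-1)
test_iter = readers.OneEpochIterator([users, items, rates], batch_size=-1)
for u, i, r in test_iter:
    print(u, i, r)                  # prints the full pass once, then stops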


Train_RecSys
Building a latent factor model recommendation system with TensorFlow
3,900 movies, 6,040 users
Dataset description: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
Data download: http://files.grouplens.org/datasets/movielens/ml-1m.zip
TensorFlow (unofficial Windows wheels): http://www.lfd.uci.edu/~gohlke/pythonlibs/#tensorflow
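The model built below is a biased matrix factorization: each rating is predicted from a global bias, a user bias, an item bias, and the inner product of the user and item embeddings. Matching the code below (\mu is bias_global, b_u and b_i the bias embeddings, p_u and q_i the dim-dimensional embeddings, \lambda the reg parameter), the prediction and the per-batch objective are

\hat{r}_{ui} = \mu + b_u + b_i + p_u^\top q_i

\mathcal{L} = \sum_{(u,i) \in \text{batch}} \tfrac{1}{2}\left(\hat{r}_{ui} - r_{ui}\right)^2 + \tfrac{\lambda}{2}\left(\lVert p_u \rVert^2 + \lVert q_i \rVert^2\right)

(the factors of 1/2 come from tf.nn.l2_loss, which computes half the sum of squares).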
# Imports for data io operations
from collections import deque
from six import next
import readers
# Main imports for training
import tensorflow as tf
import numpy as np
# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)
u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset
batch_size = 1000 # Number of samples per batch
dims = 5 # Number of latent factors (the Test_RecSys run below uses 15)
max_epochs = 50 # Number of times the network sees all the training data
# Device used for all computations
place_device = "/cpu:0"
def get_data():
    # Reads the ratings file using the delimiter ::
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("./ml-1m/ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
def clip(x):
    return np.clip(x, 1.0, 5.0)
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # AUTO_REUSE creates the variables on the first call and reuses them afterwards;
        # reuse=True would fail on a fresh graph because the variables do not exist yet
        with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):
            # Using a global bias term
            bias_global = tf.get_variable("bias_global", shape=[])
            # User and item bias variables
            # get_variable: Prefixes the name with the current variable scope
            # and performs reuse checks.
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # embedding_lookup: Looks up 'ids' in a list of embedding tensors
            # Bias embeddings for user and items, given a batch
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # User and item weight variables
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Weight embeddings for user and items, given a batch
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    return infer, regularizer
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # Plain gradient descent optimizer (the Test_RecSys version below uses FTRL instead)
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
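Since tf.nn.l2_loss includes the factor of 1/2, the cost above can be checked by hand. The following numpy sketch uses made-up toy values purely for illustration:

# Numpy equivalent of the cost above for a toy batch (illustrative values only)
pred  = np.array([3.5, 4.0], dtype=np.float32)              # infer
rates = np.array([4.0, 3.0], dtype=np.float32)              # rate_batch
p = np.array([[0.1, 0.2], [0.3, 0.1]], dtype=np.float32)    # embd_user
q = np.array([[0.2, 0.1], [0.1, 0.4]], dtype=np.float32)    # embd_item
reg = 0.1
cost_l2 = 0.5 * np.sum((pred - rates) ** 2)                  # tf.nn.l2_loss(infer - rate_batch)
regularizer = 0.5 * np.sum(p ** 2) + 0.5 * np.sum(q ** 2)    # sum of the two l2_loss terms
cost = cost_l2 + reg * regularizer
print(cost)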
# Read data from ratings file to build a TF model
df_train, df_test = get_data()
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" %
(len(df_train), len(df_test), samples_per_batch))
Output: Number of train samples 900188, test samples 100021, samples per batch 900
# Peeking at the top 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
Output:
0 1463
1 1260
2 1205
3 4657
4 5604
Name: user, dtype: int32
0 4138
1 5233
2 1419
3 2019
4 2786
Name: user, dtype: int32
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
Output:
0 2114
1 1209
2 328
3 615
4 2500
Name: item, dtype: int32
0 1403
1 3741
2 1731
3 2114
4 3104
Name: item, dtype: int32
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())
Output:
0 2.0
1 5.0
2 4.0
3 2.0
4 4.0
Name: rate, dtype: float32
0 3.0
1 5.0
2 5.0
3 3.0
4 3.0
Name: rate, dtype: float32
# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)
# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/')
Output:
Epoch Train Error Val Error Elapsed Time
00 2.817 1.114 0.034 secs
01 1.047 1.003 1.028 secs
02 0.982 0.968 1.038 secs
03 0.955 0.950 1.011 secs
04 0.941 0.940 1.005 secs
05 0.932 0.934 1.092 secs
06 0.927 0.929 1.317 secs
07 0.923 0.926 1.175 secs
08 0.918 0.923 1.117 secs
09 0.916 0.921 1.013 secs
10 0.914 0.919 1.019 secs
11 0.911 0.918 1.032 secs
12 0.910 0.917 1.037 secs
13 0.909 0.917 1.235 secs
14 0.908 0.915 1.156 secs
15 0.907 0.914 1.420 secs
16 0.907 0.914 1.324 secs
17 0.905 0.914 1.134 secs
18 0.904 0.914 1.020 secs
19 0.904 0.913 1.022 secs
20 0.904 0.913 1.065 secs
21 0.903 0.912 1.005 secs
22 0.902 0.912 1.006 secs
23 0.903 0.911 1.020 secs
24 0.902 0.911 1.036 secs
25 0.901 0.911 1.071 secs
26 0.902 0.912 1.014 secs
27 0.900 0.911 0.994 secs
28 0.901 0.911 1.014 secs
29 0.902 0.910 1.007 secs
30 0.902 0.911 1.046 secs
31 0.901 0.910 0.996 secs
32 0.899 0.910 0.996 secs
33 0.900 0.910 1.010 secs
34 0.899 0.911 1.010 secs
35 0.900 0.910 1.037 secs
36 0.899 0.910 0.999 secs
37 0.900 0.911 0.990 secs
38 0.900 0.910 1.010 secs
39 0.900 0.910 1.009 secs
40 0.899 0.910 1.040 secs
41 0.900 0.911 0.994 secs
42 0.900 0.910 0.996 secs
43 0.898 0.911 1.013 secs
44 0.899 0.910 1.013 secs
45 0.899 0.910 1.036 secs
46 0.899 0.910 0.999 secs
47 0.897 0.909 0.993 secs
48 0.899 0.910 1.012 secs
49 0.900 0.910 1.007 secs

Test_RecSys
Recommendation Systems using TensorFlow
Builds a recommendation engine using MovieLens data
We use the 1M data set for building our recommendation engine. The 1M data set contains 1,000,209 anonymous ratings:
- Ratings for approximately 3,900 movies
- Ratings provided by 6,040 MovieLens users who joined MovieLens in 2000
More information about the data can be viewed at: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt
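If the data is not already on disk, a minimal sketch for fetching and extracting it (assuming the download URL given above is still live) could look like this; the archive unpacks ratings.dat into an ml-1m/ subdirectory:

import os
import zipfile
from six.moves.urllib.request import urlretrieve

# Download and unpack the MovieLens 1M archive next to the scripts
DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
if not os.path.exists("ml-1m/ratings.dat"):
    urlretrieve(DATA_URL, "ml-1m.zip")
    with zipfile.ZipFile("ml-1m.zip") as zf:
        zf.extractall(".")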
# Imports for data io operations
from collections import deque
from six import next
import readers
# Main imports for training
import tensorflow as tf
import numpy as np
# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)
u_num = 6040 # Number of users in the dataset
i_num = 3952 # Number of movies in the dataset
batch_size = 1000 # Number of samples per batch
dims = 15 # Number of latent factors
max_epochs = 25 # Number of times the network sees all the training data
# Device used for all computations
place_device = "/cpu:0"
def get_data():
    # Reads the ratings file using the delimiter ::
    # Download MovieLens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
def clip(x):
    return np.clip(x, 1.0, 5.0)
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # Using a global bias term
        bias_global = tf.get_variable("bias_global", shape=[])
        # User and item bias variables
        # get_variable: Prefixes the name with the current variable scope
        # and performs reuse checks.
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        # embedding_lookup: Looks up 'ids' in a list of embedding tensors
        # Bias embeddings for user and items, given a batch
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        # User and item weight variables
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        # Weight embeddings for user and items, given a batch
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        # tf.mul was removed in TF 1.0; tf.multiply is the current name
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2_loss: Computes half the L2 norm of a tensor without the sqrt
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    return infer, regularizer
def loss(infer, regularizer, rate_batch, learning_rate=0.1, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # Use L2 loss to compute penalty
        # tf.sub/tf.mul were removed in TF 1.0; use tf.subtract/tf.multiply
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # 'Follow the Regularized Leader' optimizer
        # Reference: http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
        train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)
    return cost, train_op
# Read data from ratings file to build a TF model
df_train, df_test = get_data()
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" %
(len(df_train), len(df_test), samples_per_batch))
Output: Number of train samples 900188, test samples 100021, samples per batch 900
# Peeking at the top 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
Output:
0 5411
1 5439
2 367
3 424
4 4941
Name: user, dtype: int32
0 1696
1 5448
2 2242
3 5629
4 423
Name: user, dtype: int32
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
Output:
0 2682
1 903
2 3716
3 1720
4 3696
Name: item, dtype: int32
0 3113
1 1195
2 749
3 3623
4 2899
Name: item, dtype: int32
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())
Output:
0 2.0
1 5.0
2 4.0
3 4.0
4 1.0
Name: rate, dtype: float32
0 5.0
1 5.0
2 5.0
3 2.0
4 2.0
Name: rate, dtype: float32
# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
                                      df_train["item"],
                                      df_train["rate"]],
                                     batch_size=batch_size)
# Sequentially generate one-epoch batches, for testing
iter_test = readers.OneEpochIterator([df_test["user"],
                                      df_test["item"],
                                      df_test["rate"]],
                                     batch_size=-1)
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/model')

# Inference using saved model
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    #sess.run(init_op)
    new_saver = tf.train.import_meta_graph('./save/model.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./save/'))
    test_err2 = np.array([])
    for users, items, rates in iter_test:
        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                item_batch: items})
        pred_batch = clip(pred_batch)
        print("Pred\tActual")
        for ii in range(10):
            print("%.3f\t%.3f" % (pred_batch[ii], rates[ii]))
        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
    print(np.sqrt(np.mean(test_err2)))
Output:
Pred Actual
4.982 5.000
4.486 5.000
5.000 5.000
4.031 2.000
2.908 2.000
4.500 5.000
2.892 3.000
3.943 4.000
1.485 2.000
4.104 1.000
0.850395450933
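The restored graph can also be used to rank items for a single user rather than score known pairs. This is only an illustrative sketch to be run inside the with tf.Session() block above, after the restore; the user ID 0 and the top-10 cut-off are arbitrary, and infer, user_batch, item_batch, i_num and clip are the objects defined earlier:

    # Score every movie for one user and keep the 10 highest predictions
    uid = 0                                        # arbitrary example user
    all_items = np.arange(i_num, dtype=np.int32)   # every item ID in the dataset
    scores = sess.run(infer, feed_dict={user_batch: np.full(i_num, uid, dtype=np.int32),
                                        item_batch: all_items})
    scores = clip(scores)
    top10 = all_items[np.argsort(scores)[::-1][:10]]
    print("Top-10 item IDs for user %d:" % uid, top10)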