Dataset introduction
README.md
@@ -6,15 +6,9 @@
The project uses a statistical correlation coefficient, most often the Pearson correlation coefficient, to compute similarity scores and build a personalized ("a different feed for every user") recommendation system.

-## My Blog
+## Dataset Introduction

-1. **Java OPC UA series**: [https://blog.csdn.net/weixin_40986713/category_12356608.html](https://blog.csdn.net/weixin_40986713/category_12356608.html)
-2. **AI painting | Stable Diffusion**: [https://blog.csdn.net/weixin_40986713/category_12481790.html](https://blog.csdn.net/weixin_40986713/category_12481790.html)
-3. **Advanced Java series**: [https://blog.csdn.net/weixin_40986713/category_10796066.html](https://blog.csdn.net/weixin_40986713/category_10796066.html)
-4. **Java Selenium automated crawler series**: [https://blog.csdn.net/weixin_40986713/category_12165790.html](https://blog.csdn.net/weixin_40986713/category_12165790.html)
-5. **Java recommendation algorithm series**: [https://blog.csdn.net/weixin_40986713/category_12268014.html](https://blog.csdn.net/weixin_40986713/category_12268014.html)
-6. **Java video and image processing series**: [https://blog.csdn.net/weixin_40986713/category_11109931.html](https://blog.csdn.net/weixin_40986713/category_11109931.html)
+https://grouplens.org/datasets/movielens/100k/

#### Collaborative Filtering Algorithm

Collaborative filtering is the earliest and one of the best-known recommendation algorithms; its main purposes are prediction and recommendation. Collaborative Filtering (CF) is one of the most important ideas in recommender systems: recommend items to a user based on the user's own past preferences and on the choices of other users with similar tastes. In other words, mine the user's historical behavior to discover preference patterns and predict items the user is likely to enjoy. CF generally relies only on behavioral data (ratings, purchases, downloads, and so on) and does not need any extra item attributes (item features) or user attributes (age, gender, etc.). In short: birds of a feather flock together.
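For context on the Pearson correlation mentioned above, here is a minimal sketch of the plain formula applied to two users' rating vectors. It is illustrative only: the project's own similarity routine (`CoreMath.computeNeighbor`, referenced in the hunks below) is not part of this diff, and the class and method names in the sketch are made up for the example.

```java
import java.util.List;

// Illustrative only: Pearson correlation between two equally long rating vectors.
// The result lies in [-1, 1]; higher means more similar taste.
public class PearsonExample {

    public static double pearson(List<Double> x, List<Double> y) {
        int n = x.size();
        double sumX = 0, sumY = 0, sumXY = 0, sumX2 = 0, sumY2 = 0;
        for (int i = 0; i < n; i++) {
            double xi = x.get(i);
            double yi = y.get(i);
            sumX += xi;
            sumY += yi;
            sumXY += xi * yi;
            sumX2 += xi * xi;
            sumY2 += yi * yi;
        }
        double numerator = n * sumXY - sumX * sumY;
        double denominator = Math.sqrt(n * sumX2 - sumX * sumX) * Math.sqrt(n * sumY2 - sumY * sumY);
        // a constant rating vector has zero variance; treat it as "no correlation"
        return denominator == 0 ? 0 : numerator / denominator;
    }

    public static void main(String[] args) {
        // two users' ratings of the same five movies
        System.out.println(pearson(List.of(5.0, 3.0, 4.0, 4.0, 1.0),
                                   List.of(4.0, 3.0, 5.0, 3.0, 1.0)));
    }
}
```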
@@ -19,18 +19,18 @@ public class ItemCF {
     * Description: returns a list of recommended movie ids
     *
     * @param itemId current movie id
     * @param list   user-movie rating records
     * @return {@link List<Integer>}
     * @date 2023-02-02 14:51:42
     */
    public static List<Integer> recommend(Integer itemId, List<RelateDTO> list) {
        // group the rating records by item
-       Map<Integer, List<RelateDTO>> itemMap=list.stream().collect(Collectors.groupingBy(RelateDTO::getItemId));
+       Map<Integer, List<RelateDTO>> itemMap = list.stream().collect(Collectors.groupingBy(RelateDTO::getItemId));
        // compute the similarity between the current item and every other item
-       Map<Integer,Double> itemDisMap = CoreMath.computeNeighbor(itemId, itemMap,1);
+       Map<Integer, Double> itemDisMap = CoreMath.computeNeighbor(itemId, itemMap, 1);
        // keep the item(s) with the highest similarity
-       double maxValue=Collections.max(itemDisMap.values());
+       double maxValue = Collections.max(itemDisMap.values());
-       return itemDisMap.entrySet().stream().filter(e->e.getValue()==maxValue).map(Map.Entry::getKey).collect(Collectors.toList());
+       return itemDisMap.entrySet().stream().filter(e -> e.getValue() == maxValue).map(Map.Entry::getKey).collect(Collectors.toList());
    }
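As a quick illustration of how the `recommend` API documented above can be called, here is a minimal usage sketch (not part of this commit). It only uses classes that appear in this diff (`FileDataSource`, `ItemCF`, `RelateDTO`); the wrapper class name `ItemCfDemo` and the movie id 50 are made up for illustration.

```java
import java.util.List;

// Minimal usage sketch (not part of this commit); the class name and movie id are illustrative.
public class ItemCfDemo {
    public static void main(String[] args) {
        // load the MovieLens 100k ratings through the project's data source
        List<RelateDTO> ratings = FileDataSource.getData();
        // ids of the movies most similar to movie 50 according to item-based CF
        List<Integer> similarMovieIds = ItemCF.recommend(50, ratings);
        System.out.println(similarMovieIds);
    }
}
```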
@@ -19,27 +19,27 @@ public class UserCF {
     * Description: returns a list of recommended movie ids
     *
     * @param userId current user id
     * @param list   user-movie rating records
     * @return {@link List<Integer>}
     * @date 2023-02-02 14:51:42
     */
    public static List<Integer> recommend(Integer userId, List<RelateDTO> list) {
        // group the rating records by user
-       Map<Integer, List<RelateDTO>> userMap=list.stream().collect(Collectors.groupingBy(RelateDTO::getUseId));
+       Map<Integer, List<RelateDTO>> userMap = list.stream().collect(Collectors.groupingBy(RelateDTO::getUseId));
        // compute the similarity between the current user and every other user
-       Map<Integer,Double> userDisMap = CoreMath.computeNeighbor(userId, userMap,0);
+       Map<Integer, Double> userDisMap = CoreMath.computeNeighbor(userId, userMap, 0);
        // find the users with the highest similarity
-       double maxValue=Collections.max(userDisMap.values());
+       double maxValue = Collections.max(userDisMap.values());
-       Set<Integer> userIds=userDisMap.entrySet().stream().filter(e->e.getValue()==maxValue).map(Map.Entry::getKey).collect(Collectors.toSet());
+       Set<Integer> userIds = userDisMap.entrySet().stream().filter(e -> e.getValue() == maxValue).map(Map.Entry::getKey).collect(Collectors.toSet());
        // pick any one of the nearest users
        Integer nearestUserId = userIds.stream().findAny().orElse(null);
-       if(nearestUserId==null){
+       if (nearestUserId == null) {
            return Collections.emptyList();
        }
        // movies the nearest-neighbor user has watched
        List<Integer> neighborItems = userMap.get(nearestUserId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
        // movies the target user has watched
        List<Integer> userItems = userMap.get(userId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
        // keep the movies the neighbor has watched but the target user has not
        neighborItems.removeAll(userItems);
        return neighborItems;
@@ -20,10 +20,11 @@ import java.util.Objects;
public class FileDataSource {


    public static String folderPath;
+
    static {
-       folderPath= Objects.requireNonNull(FileDataSource.class.getResource("/ml-100k")).getPath();
+       folderPath = Objects.requireNonNull(FileDataSource.class.getResource("/ml-100k")).getPath();
    }


    /**
@@ -36,12 +37,13 @@ public class FileDataSource {
    public static List<RelateDTO> getData() {
        List<RelateDTO> relateList = Lists.newArrayList();
        try {
-           FileInputStream out = new FileInputStream(folderPath+"\\u.data");
+           FileInputStream out = new FileInputStream(folderPath + File.separator + "u.data");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
            while ((line = in.readLine()) != null) {
                String newline = line.replaceAll("[\t]", " ");
+               // 196 242 3 881250949
                String[] ht = newline.split(" ");
                Integer userId = Integer.parseInt(ht[0]);
                Integer movieId = Integer.parseInt(ht[1]);
@@ -65,7 +67,7 @@ public class FileDataSource {
    public static List<UserDTO> getUserData() {
        List<UserDTO> userList = Lists.newArrayList();
        try {
-           FileInputStream out = new FileInputStream(folderPath+"\\u.user");
+           FileInputStream out = new FileInputStream(folderPath + File.separator + "u.user");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
@@ -97,7 +99,7 @@ public class FileDataSource {
    public static List<ItemDTO> getItemData() {
        List<ItemDTO> itemList = Lists.newArrayList();
        try {
-           FileInputStream out = new FileInputStream(folderPath+"\\u.item");
+           FileInputStream out = new FileInputStream(folderPath + File.separator + "u.item");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
@@ -29,6 +29,7 @@ public class Recommend{
     */
    public static List<ItemDTO> userCfRecommend(int userId){
        List<RelateDTO> data= FileDataSource.getData();
+       // System.out.println(data);
        List<Integer> recommendations = UserCF.recommend(userId, data);
        return FileDataSource.getItemData().stream().filter(e->recommendations.contains(e.getId())).collect(Collectors.toList());
    }
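For completeness, here is a minimal sketch of how the user-based entry point touched in the hunk above is meant to be called (not part of this commit). It only uses classes shown in this diff (`Recommend`, `ItemDTO`); the class name `RecommendDemo` and the user id 1 are made up for illustration.

```java
import java.util.List;

// Minimal usage sketch (not part of this commit); class name and user id are illustrative.
public class RecommendDemo {
    public static void main(String[] args) {
        // user-based CF: movies watched by the most similar user but not yet by user 1
        List<ItemDTO> movies = Recommend.userCfRecommend(1);
        movies.forEach(System.out::println);
    }
}
```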
src/main/resources/ml-100k/readme.txt (new file, 156 lines)
@@ -0,0 +1,156 @@
SUMMARY & USAGE LICENSE
=============================================

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.

This data set consists of:
    * 100,000 ratings (1-5) from 943 users on 1682 movies.
    * Each user has rated at least 20 movies.
    * Simple demographic info for the users (age, gender, occupation, zip)

The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th,
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:

    * The user may not state or imply any endorsement from the
      University of Minnesota or the GroupLens Research Group.

    * The user must acknowledge the use of the data set in
      publications resulting from the use of the data set
      (see below for citation information).

    * The user may not redistribute the data without separate
      permission.

    * The user may not use this information for any commercial or
      revenue-bearing purposes without first obtaining permission
      from a faculty member of the GroupLens Research Project at the
      University of Minnesota.

If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.

CITATION
==============================================

To acknowledge use of the dataset in publications, please cite the
following paper:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
History and Context. ACM Transactions on Interactive Intelligent
Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
DOI=http://dx.doi.org/10.1145/2827872

ACKNOWLEDGEMENTS
==============================================

Thanks to Al Borchers for cleaning up this data and writing the
accompanying scripts.

PUBLISHED WORK THAT HAS USED THIS DATASET
==============================================

Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic
Framework for Performing Collaborative Filtering. Proceedings of the
1999 Conference on Research and Development in Information
Retrieval. Aug. 1999.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
==============================================

The GroupLens Research Project is a research group in the Department
of Computer Science and Engineering at the University of Minnesota.
Members of the GroupLens Research Project are involved in many
research projects related to the fields of information filtering,
collaborative filtering, and recommender systems. The project is lead
by professors John Riedl and Joseph Konstan. The project began to
explore automated collaborative filtering in 1992, but is most well
known for its world wide trial of an automated collaborative filtering
system for Usenet news in 1996. The technology developed in the
Usenet trial formed the base for the formation of Net Perceptions,
Inc., which was founded by members of GroupLens Research. Since then
the project has expanded its scope to research overall information
filtering solutions, integrating in content-based methods as well as
improving current collaborative filtering technology.

Further information on the GroupLens Research project, including
research publications, can be found at the following web site:

http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on
collaborative filtering:

http://www.movielens.org/

DETAILED DESCRIPTIONS OF DATA FILES
==============================================

Here are brief descriptions of the data.

ml-data.tar.gz -- Compressed tar file. To rebuild the u data files do this:
    gunzip ml-data.tar.gz
    tar xvf ml-data.tar
    mku.sh

u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.
          Each user has rated at least 20 movies. Users and items are
          numbered consecutively from 1. The data is randomly
          ordered. This is a tab separated list of
          user id | item id | rating | timestamp.
          The time stamps are unix seconds since 1/1/1970 UTC

u.info -- The number of users, items, and ratings in the u data set.

u.item -- Information about the items (movies); this is a tab separated
          list of
          movie id | movie title | release date | video release date |
          IMDb URL | unknown | Action | Adventure | Animation |
          Children's | Comedy | Crime | Documentary | Drama | Fantasy |
          Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
          Thriller | War | Western |
          The last 19 fields are the genres, a 1 indicates the movie
          is of that genre, a 0 indicates it is not; movies can be in
          several genres at once.
          The movie ids are the ones used in the u.data data set.

u.genre -- A list of the genres.

u.user -- Demographic information about the users; this is a tab
          separated list of
          user id | age | gender | occupation | zip code
          The user ids are the ones used in the u.data data set.

u.occupation -- A list of the occupations.

u1.base -- The data sets u1.base and u1.test through u5.base and u5.test
u1.test    are 80%/20% splits of the u data into training and test data.
u2.base    Each of u1, ..., u5 have disjoint test sets; this if for
u2.test    5 fold cross validation (where you repeat your experiment
u3.base    with each training and test set and average the results).
u3.test    These data sets can be generated from u.data by mku.sh.
u4.base
u4.test
u5.base
u5.test

ua.base -- The data sets ua.base, ua.test, ub.base, and ub.test
ua.test    split the u data into a training set and a test set with
ub.base    exactly 10 ratings per user in the test set. The sets
ub.test    ua.test and ub.test are disjoint. These data sets can
           be generated from u.data by mku.sh.

allbut.pl -- The script that generates training and test sets where
             all but n of a users ratings are in the training data.

mku.sh -- A shell script to generate all the u data sets from u.data.
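To make the u.data record format described above concrete, here is a minimal parsing sketch. It is illustrative only and separate from the project's FileDataSource.getData(); the class name is made up, and the sample line is the record quoted in the FileDataSource hunk earlier in this commit.

```java
// Minimal sketch (not part of readme.txt or this commit): parse one u.data record,
// documented above as a tab separated list of user id | item id | rating | timestamp.
public class UDataLineExample {
    public static void main(String[] args) {
        String line = "196\t242\t3\t881250949"; // sample record from the FileDataSource hunk
        String[] fields = line.split("\t");
        int userId = Integer.parseInt(fields[0]);
        int itemId = Integer.parseInt(fields[1]);
        int rating = Integer.parseInt(fields[2]);
        long timestamp = Long.parseLong(fields[3]); // unix seconds since 1/1/1970 UTC
        System.out.printf("user=%d item=%d rating=%d ts=%d%n", userId, itemId, rating, timestamp);
    }
}
```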