数据集介绍

2024-09-03 14:18:41 +08:00
parent 976d60981f
commit 6fd3b67ed8
6 changed files with 181 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -6,15 +6,9 @@
 利用统计学的相关系数经常皮尔森（pearson）相关系数计算相关系数来实现千人千面的推荐系统。


-## 我的博客
-
-1. **JAVA OPC UA专栏**：[https://blog.csdn.net/weixin_40986713/category_12356608.html](https://blog.csdn.net/weixin_40986713/category_12356608.html)
-2. **AI绘画 | Stable diffusion**：[https://blog.csdn.net/weixin_40986713/category_12481790.html](https://blog.csdn.net/weixin_40986713/category_12481790.html) 
-3. **java高级技术专栏**：[https://blog.csdn.net/weixin_40986713/category_10796066.html](https://blog.csdn.net/weixin_40986713/category_10796066.html)
-4. **java Selenium自动化爬虫**：[https://blog.csdn.net/weixin_40986713/category_12165790.html](https://blog.csdn.net/weixin_40986713/category_12165790.html)
-5. **java 推荐算法专栏**：[https://blog.csdn.net/weixin_40986713/category_12268014.html](https://blog.csdn.net/weixin_40986713/category_12268014.html)
-6. **Java视频图像处理专栏**：[https://blog.csdn.net/weixin_40986713/category_11109931.html](https://blog.csdn.net/weixin_40986713/category_11109931.html) 
+## 数据集介绍

+https://grouplens.org/datasets/movielens/100k/

 #### 协同过滤算法
 协同过滤推荐算法是诞生最早，并且较为著名的推荐算法。主要的功能是预测和推荐。协同过滤(Collaborative Filtering,简写CF)是推荐系统最重要得思想之一，其思想是根据用户之前得喜好以及其他兴趣相近得用户得选择来给用户推荐物品(基于对用户历史行为数据的挖掘发现用户的喜好偏向，并预测用户可能喜好的产品进行推荐)，一般仅仅基于用户的行为数据（评价，购买，下载等），而不依赖于物品的任何附加信息（物品自身特征）或者用户的任何附加信息（年龄，性别等）。其思想总的来说就是：人以类聚，物以群分。
--- a/src/main/java/com/tarzan/recommend/core/ItemCF.java
+++ b/src/main/java/com/tarzan/recommend/core/ItemCF.java
@@ -19,18 +19,18 @@ public class ItemCF {
     * 方法描述: 推荐电影id列表
     *
     * @param itemId 当前电影id
-     * @param list 用户电影评分数据
+     * @param list   用户电影评分数据
     * @return {@link List<Integer>}
     * @date 2023年02月02日 14:51:42
     */
    public static List<Integer> recommend(Integer itemId, List<RelateDTO> list) {
        //按物品分组
-        Map<Integer, List<RelateDTO>>  itemMap=list.stream().collect(Collectors.groupingBy(RelateDTO::getItemId));
+        Map<Integer, List<RelateDTO>> itemMap = list.stream().collect(Collectors.groupingBy(RelateDTO::getItemId));
        //获取其他物品与当前物品的关系值
-        Map<Integer,Double>  itemDisMap = CoreMath.computeNeighbor(itemId, itemMap,1);
+        Map<Integer, Double> itemDisMap = CoreMath.computeNeighbor(itemId, itemMap, 1);
        //获取关系最近物品
-        double maxValue=Collections.max(itemDisMap.values());
-        return itemDisMap.entrySet().stream().filter(e->e.getValue()==maxValue).map(Map.Entry::getKey).collect(Collectors.toList());
+        double maxValue = Collections.max(itemDisMap.values());
+        return itemDisMap.entrySet().stream().filter(e -> e.getValue() == maxValue).map(Map.Entry::getKey).collect(Collectors.toList());
    }


--- a/src/main/java/com/tarzan/recommend/core/UserCF.java
+++ b/src/main/java/com/tarzan/recommend/core/UserCF.java
@@ -19,27 +19,27 @@ public class UserCF {
     * 方法描述: 推荐电影id列表
     *
     * @param userId 当前用户
-     * @param list 用户电影评分数据
+     * @param list   用户电影评分数据
     * @return {@link List<Integer>}
     * @date 2023年02月02日 14:51:42
     */
    public static List<Integer> recommend(Integer userId, List<RelateDTO> list) {
        //按用户分组
-        Map<Integer, List<RelateDTO>>  userMap=list.stream().collect(Collectors.groupingBy(RelateDTO::getUseId));
+        Map<Integer, List<RelateDTO>> userMap = list.stream().collect(Collectors.groupingBy(RelateDTO::getUseId));
        //获取其他用户与当前用户的关系值
-        Map<Integer,Double>  userDisMap = CoreMath.computeNeighbor(userId, userMap,0);
+        Map<Integer, Double> userDisMap = CoreMath.computeNeighbor(userId, userMap, 0);
        //获取关系最近的用户
-        double maxValue=Collections.max(userDisMap.values());
-        Set<Integer> userIds=userDisMap.entrySet().stream().filter(e->e.getValue()==maxValue).map(Map.Entry::getKey).collect(Collectors.toSet());
+        double maxValue = Collections.max(userDisMap.values());
+        Set<Integer> userIds = userDisMap.entrySet().stream().filter(e -> e.getValue() == maxValue).map(Map.Entry::getKey).collect(Collectors.toSet());
        //取关系最近的用户
        Integer nearestUserId = userIds.stream().findAny().orElse(null);
-        if(nearestUserId==null){
+        if (nearestUserId == null) {
            return Collections.emptyList();
        }
        //最近邻用户看过电影列表
-        List<Integer>  neighborItems = userMap.get(nearestUserId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
+        List<Integer> neighborItems = userMap.get(nearestUserId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
        //指定用户看过电影列表
-        List<Integer>  userItems  = userMap.get(userId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
+        List<Integer> userItems = userMap.get(userId).stream().map(RelateDTO::getItemId).collect(Collectors.toList());
        //找到最近邻看过，但是该用户没看过的电影
        neighborItems.removeAll(userItems);
        return neighborItems;
--- a/src/main/java/com/tarzan/recommend/service/FileDataSource.java
+++ b/src/main/java/com/tarzan/recommend/service/FileDataSource.java
@@ -20,10 +20,11 @@ import java.util.Objects;
 public class FileDataSource {


-      public  static String folderPath;
-      static {
-          folderPath= Objects.requireNonNull(FileDataSource.class.getResource("/ml-100k")).getPath();
-      }
+    public static String folderPath;
+
+    static {
+        folderPath = Objects.requireNonNull(FileDataSource.class.getResource("/ml-100k")).getPath();
+    }


    /**
@@ -36,12 +37,13 @@ public class FileDataSource {
    public static List<RelateDTO> getData() {
        List<RelateDTO> relateList = Lists.newArrayList();
        try {
-            FileInputStream out = new FileInputStream(folderPath+"\\u.data");
+            FileInputStream out = new FileInputStream(folderPath + File.separator + "u.data");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
            while ((line = in.readLine()) != null) {
                String newline = line.replaceAll("[\t]", " ");
+                // 196	242	3	881250949
                String[] ht = newline.split(" ");
                Integer userId = Integer.parseInt(ht[0]);
                Integer movieId = Integer.parseInt(ht[1]);
@@ -65,7 +67,7 @@ public class FileDataSource {
    public static List<UserDTO> getUserData() {
        List<UserDTO> userList = Lists.newArrayList();
        try {
-            FileInputStream out = new FileInputStream(folderPath+"\\u.user");
+            FileInputStream out = new FileInputStream(folderPath + File.separator + "u.user");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
@@ -97,7 +99,7 @@ public class FileDataSource {
    public static List<ItemDTO> getItemData() {
        List<ItemDTO> itemList = Lists.newArrayList();
        try {
-            FileInputStream out = new FileInputStream(folderPath+"\\u.item");
+            FileInputStream out = new FileInputStream(folderPath + File.separator + "u.item");
            InputStreamReader reader = new InputStreamReader(out, StandardCharsets.UTF_8);
            BufferedReader in = new BufferedReader(reader);
            String line;
--- a/src/main/java/com/tarzan/recommend/service/Recommend.java
+++ b/src/main/java/com/tarzan/recommend/service/Recommend.java
@@ -29,6 +29,7 @@ public class Recommend{
     */
    public static List<ItemDTO>  userCfRecommend(int userId){
        List<RelateDTO> data= FileDataSource.getData();
+//        System.out.println(data);
        List<Integer> recommendations = UserCF.recommend(userId, data);
        return FileDataSource.getItemData().stream().filter(e->recommendations.contains(e.getId())).collect(Collectors.toList());
    }
--- a/src/main/resources/ml-100k/readme.txt
+++ b/src/main/resources/ml-100k/readme.txt
@@ -0,0 +1,156 @@
+SUMMARY & USAGE LICENSE
+=============================================
+
+MovieLens data sets were collected by the GroupLens Research Project
+at the University of Minnesota.
+
+This data set consists of:
+	* 100,000 ratings (1-5) from 943 users on 1682 movies.
+	* Each user has rated at least 20 movies.
+        * Simple demographic info for the users (age, gender, occupation, zip)
+
+The data was collected through the MovieLens web site
+(movielens.umn.edu) during the seven-month period from September 19th,
+1997 through April 22nd, 1998. This data has been cleaned up - users
+who had fewer than 20 ratings or did not have complete demographic
+information were removed from this data set. Detailed descriptions of
+the data file can be found at the end of this file.
+
+Neither the University of Minnesota nor any of the researchers
+involved can guarantee the correctness of the data, its suitability
+for any particular purpose, or the validity of results based on the
+use of the data set.  The data set may be used for any research
+purposes under the following conditions:
+
+     * The user may not state or imply any endorsement from the
+       University of Minnesota or the GroupLens Research Group.
+
+     * The user must acknowledge the use of the data set in
+       publications resulting from the use of the data set
+       (see below for citation information).
+
+     * The user may not redistribute the data without separate
+       permission.
+
+     * The user may not use this information for any commercial or
+       revenue-bearing purposes without first obtaining permission
+       from a faculty member of the GroupLens Research Project at the
+       University of Minnesota.
+
+If you have any further questions or comments, please contact GroupLens
+<grouplens-info@cs.umn.edu>.
+
+CITATION
+==============================================
+
+To acknowledge use of the dataset in publications, please cite the
+following paper:
+
+F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
+History and Context. ACM Transactions on Interactive Intelligent
+Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
+DOI=http://dx.doi.org/10.1145/2827872
+
+ACKNOWLEDGEMENTS
+==============================================
+
+Thanks to Al Borchers for cleaning up this data and writing the
+accompanying scripts.
+
+PUBLISHED WORK THAT HAS USED THIS DATASET
+==============================================
+
+Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic
+Framework for Performing Collaborative Filtering. Proceedings of the
+1999 Conference on Research and Development in Information
+Retrieval. Aug. 1999.
+
+FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
+==============================================
+
+The GroupLens Research Project is a research group in the Department
+of Computer Science and Engineering at the University of Minnesota.
+Members of the GroupLens Research Project are involved in many
+research projects related to the fields of information filtering,
+collaborative filtering, and recommender systems. The project is led
+by professors John Riedl and Joseph Konstan. The project began to
+explore automated collaborative filtering in 1992, but is most well
+known for its world wide trial of an automated collaborative filtering
+system for Usenet news in 1996.  The technology developed in the
+Usenet trial formed the base for the formation of Net Perceptions,
+Inc., which was founded by members of GroupLens Research. Since then
+the project has expanded its scope to research overall information
+filtering solutions, integrating in content-based methods as well as
+improving current collaborative filtering technology.
+
+Further information on the GroupLens Research project, including
+research publications, can be found at the following web site:
+
+        http://www.grouplens.org/
+
+GroupLens Research currently operates a movie recommender based on
+collaborative filtering:
+
+        http://www.movielens.org/
+
+DETAILED DESCRIPTIONS OF DATA FILES
+==============================================
+
+Here are brief descriptions of the data.
+
+ml-data.tar.gz   -- Compressed tar file.  To rebuild the u data files do this:
+                gunzip ml-data.tar.gz
+                tar xvf ml-data.tar
+                mku.sh
+
+u.data     -- The full u data set, 100000 ratings by 943 users on 1682 items.
+              Each user has rated at least 20 movies.  Users and items are
+              numbered consecutively from 1.  The data is randomly
+              ordered. This is a tab separated list of
+	         user id | item id | rating | timestamp.
+              The time stamps are unix seconds since 1/1/1970 UTC
+
+u.info     -- The number of users, items, and ratings in the u data set.
+
+u.item     -- Information about the items (movies); this is a tab separated
+              list of
+              movie id | movie title | release date | video release date |
+              IMDb URL | unknown | Action | Adventure | Animation |
+              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
+              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
+              Thriller | War | Western |
+              The last 19 fields are the genres, a 1 indicates the movie
+              is of that genre, a 0 indicates it is not; movies can be in
+              several genres at once.
+              The movie ids are the ones used in the u.data data set.
+
+u.genre    -- A list of the genres.
+
+u.user     -- Demographic information about the users; this is a tab
+              separated list of
+              user id | age | gender | occupation | zip code
+              The user ids are the ones used in the u.data data set.
+
+u.occupation -- A list of the occupations.
+
+u1.base    -- The data sets u1.base and u1.test through u5.base and u5.test
+u1.test       are 80%/20% splits of the u data into training and test data.
+u2.base       Each of u1, ..., u5 have disjoint test sets; this is for
+u2.test       5 fold cross validation (where you repeat your experiment
+u3.base       with each training and test set and average the results).
+u3.test       These data sets can be generated from u.data by mku.sh.
+u4.base
+u4.test
+u5.base
+u5.test
+
+ua.base    -- The data sets ua.base, ua.test, ub.base, and ub.test
+ua.test       split the u data into a training set and a test set with
+ub.base       exactly 10 ratings per user in the test set.  The sets
+ub.test       ua.test and ub.test are disjoint.  These data sets can
+              be generated from u.data by mku.sh.
+
+allbut.pl  -- The script that generates training and test sets where
+              all but n of a user's ratings are in the training data.
+
+mku.sh     -- A shell script to generate all the u data sets from u.data.