From ab35ac8b769b2d9816dffb33a64f2c6f7bd5dd6e Mon Sep 17 00:00:00 2001
From: admin <weikou2014>
Date: 星期四, 05 九月 2024 17:05:55 +0800
Subject: [PATCH] 风行网页版爬虫

---
 src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java |  202 +++++++++++++++++++++++++++++++++++---------------
 1 files changed, 141 insertions(+), 61 deletions(-)

diff --git a/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java b/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java
index 71b6ddd..6637d93 100644
--- a/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java
+++ b/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java
@@ -2,25 +2,28 @@
 
 import com.google.gson.Gson;
 import com.google.gson.reflect.TypeToken;
-import com.yeshi.buwan.videos.tencent.entity.TencentCoverInfo;
+import com.yeshi.buwan.util.StringUtil;
 import com.yeshi.buwan.util.video.web.TencentWebUtil;
+import com.yeshi.buwan.videos.tencent.vo.TencentCoverInfoVO;
+import net.sf.json.JSONArray;
 import net.sf.json.JSONObject;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.script.Invocable;
 import javax.script.ScriptEngine;
 import javax.script.ScriptEngineManager;
 import javax.script.ScriptException;
 import java.lang.reflect.Type;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
 
 public class TencentVideoApiUtil {
 
     static ScriptEngine jsEngine = null;
+    static Logger loggerDebug = LoggerFactory.getLogger("debug");
 
 //    static {
 //        if (jdGoodsJs == null)
@@ -68,12 +71,67 @@
         return directors;
     }
 
-    public static TencentCoverInfo getCoverInfo(String url) throws Exception {
+    private static List<TencentCoverInfoVO.EpisodeVO> getPageData(String cid, String page_context) {
+        List<TencentCoverInfoVO.EpisodeVO> voList = new ArrayList<>();
+        Map<String, String> headers = new HashMap<>();
+        headers.put("Content-Type", "application/json; charset=utf-8");
+        headers.put("Referer", "https://v.qq.com/");
+        String text =
+                "{\"page_params\":{\"req_from\":\"web_vsite\",\"page_id\":\"vsite_episode_list\",\"page_type\":\"detail_operation\",\"id_type\":\"1\",\"page_size\":\"\",\"cid\":\"mzc002007j7p5hn\",\"vid\":\"\",\"lid\":\"\",\"page_num\":\"\",\"page_context\":\"chapter_name=&cid=mzc002007j7p5hn&detail_page_type=1&episode_begin=31&episode_end=60&episode_step=30&filter_rule_id=&id_type=1&is_nocopyright=false&is_skp_style=false&lid=&list_page_context=&need_tab=1&order=&page_num=1&page_size=30&req_from=web_vsite&req_from_second_type=&req_type=0&siteName=&tab_type=1&title_style=&ui_type=null&un_strategy_id=13dc6f30819942eb805250fb671fb082&watch_together_pay_status=0&year=\",\"detail_page_type\":\"1\"},\"has_cache\":1}";
+        JSONObject params = JSONObject.fromObject(text);
+        params.optJSONObject("page_params").put("cid", cid + "");
+        params.optJSONObject("page_params").put("page_context", page_context + "");
+        String result = com.yeshi.buwan.util.HttpUtil.post("https://pbaccess.video.qq.com/trpc.universal_backend_service.page_server_rpc.PageServer/GetPageData?video_appid=3000010&vplatform=2&vversion_name=8.2.96", params.toString(), headers);
+        System.out.println(result);
+        JSONObject root = JSONObject.fromObject(result);
+        JSONArray items = root.optJSONObject("data").optJSONArray("module_list_datas").optJSONObject(0).optJSONArray("module_datas").optJSONObject(0).optJSONObject("item_data_lists").optJSONArray("item_datas");
+        for (int i = 0; i < items.size(); i++) {
+            JSONObject item = items.optJSONObject(i);
+            if(item.optInt("item_type")!=1){
+              continue;
+            }
+
+            JSONObject item_params = item.optJSONObject("item_params");
+            TencentCoverInfoVO.EpisodeVO vo = new TencentCoverInfoVO.EpisodeVO();
+            vo.setCid(item_params.optString("cid"));
+            vo.setDuration(item_params.optInt("duration"));
+            vo.setEpIndex(item_params.optInt("title") - 1);
+            vo.setFullTitle(item_params.optString("union_title"));
+            vo.setIndex(item_params.optInt("title") - 1);
+            vo.setIsNoStoreWatchHistory(item_params.optInt("is_no_store_watch_history") > 0);
+            vo.setItemType(item.optString("item_type"));
+            vo.setPic(item_params.optString("image_url"));
+            vo.setPlayTitle(item_params.optString("play_title"));
+            vo.setTitle(item_params.optString("title"));
+            vo.setVid(item_params.optString("vid"));
+            if(!vo.isIsNoStoreWatchHistory()){
+                voList.add(vo);
+            }
+
+        }
+        return voList;
+
+    }
+
+    /**
+     * @author hxh 
+     * @description Whether the page is online (true when the fetched page has no "page_404" element)
+     * @date 17:39 2024/8/16
+     * @param: url
+     * @return boolean
+     **/
+    public static boolean isOnLine(String url) throws Exception{
+        Document doc = Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36").get();
+        return    doc.getElementsByClass("page_404").size()==0;
+
+    }
+
+    public static TencentCoverInfoVO getCoverInfo(String url) throws Exception {
         String script = null;
         Document doc = Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36").get();
         Elements els = doc.getElementsByTag("script");
         for (int i = 0; i < els.size(); i++) {
-            if (els.get(i).html().indexOf("var COVER_INFO") > -1) {
+            if (els.get(i).html().indexOf("window.__PINIA__") > -1) {
                 script = els.get(i).html();
                 break;
             }
@@ -81,7 +139,7 @@
 
         script = script.replace("window.", "_window.");
         script = " var _window={};" + "var document={getElementsByTagName:function(tag){return [''];}};" + script;
-        script += "\n function getCoverInfo(){return COVER_INFO }; function getVideoInfo(){return VIDEO_INFO}";
+        script += "\n function getCoverInfo(){return _window.__PINIA__.global.coverInfo;}; function getVideoInfo(){return _window.__PINIA__.episodeMain.listData;}";//episodeMain.listData[0].list[0];
 
         ScriptEngineManager manager = new ScriptEngineManager();
         jsEngine = manager.getEngineByName("javascript");
@@ -97,65 +155,102 @@
             Object coverInfo = in.invokeFunction("getCoverInfo");
             JSONObject root = JSONObject.fromObject(gson.toJson(coverInfo));
             System.out.println(root.toString());
-            List<TencentCoverInfo.VipIdsBean> vipIdsBeans = new ArrayList<>();
+            List<TencentCoverInfoVO.EpisodeVO> vipIdsBeans = new ArrayList<>();
 
-            if (root.optJSONObject("vip_ids").isArray()) {
-                Type type = new TypeToken<List<TencentCoverInfo.VipIdsBean>>() {
-                }.getType();
-                vipIdsBeans = gson.fromJson(root.optJSONArray("vip_ids").toString(), type);
+            Object videoInfo = in.invokeFunction("getVideoInfo");
+            Object tabs = null;
+            if (videoInfo instanceof Map) {
+                Map videoInfoMap = (Map) videoInfo;
+                String key = videoInfoMap.keySet().iterator().next().toString();
+                videoInfoMap = (Map) videoInfoMap.get(key);
+                tabs = videoInfoMap.get("tabs");
+                videoInfoMap = (Map) videoInfoMap.get("list");
+
+
+                key = videoInfoMap.keySet().iterator().next().toString();
+                videoInfo = videoInfoMap.get(key);
+            } else if (videoInfo instanceof List) {
+                List tempList = (List) videoInfo;
+                tabs = ((Map) tempList.get(0)).get("tabs");
+                tempList = (List) (((Map) tempList.get(0)).get("list"));
+                videoInfo = tempList.get(0);
+            }
+            tabs = toList(tabs);
+            TencentCoverInfoVO coverInfoBean = gson.fromJson(root.toString(), TencentCoverInfoVO.class);
+            if (tabs != null&&((List) tabs).size()>0) {
+                // Need to fetch the episodes: one getPageData request per tab's pageContext
+                JSONArray tabArrays = JSONArray.fromObject(gson.toJson(tabs));
+                for (int i = 0; i < tabArrays.size(); i++) {
+                    String pageContext = tabArrays.optJSONObject(i).optString("pageContext");
+                    List<TencentCoverInfoVO.EpisodeVO> epList = getPageData(coverInfoBean.getCover_id(), pageContext);
+                    if (epList != null) {
+                        vipIdsBeans.addAll(epList);
+                    }
+                }
             } else {
-                JSONObject vipIds = root.optJSONObject("vip_ids");
-                for (Iterator<String> its = vipIds.keys(); its.hasNext(); ) {
-                    String p = its.next();
-                    TencentCoverInfo.VipIdsBean idsBean = gson.fromJson(vipIds.optJSONObject(p).toString(), TencentCoverInfo.VipIdsBean.class);
-                    idsBean.setP(Integer.parseInt(p));
-                    vipIdsBeans.add(idsBean);
+                System.out.printf(gson.toJson(videoInfo));
+                JSONObject episodeObj = JSONObject.fromObject(gson.toJson(videoInfo));
+                for (Object k : episodeObj.keySet()) {
+                    JSONObject item = episodeObj.optJSONObject(k.toString());
+                    if (item.optBoolean("isNoStoreWatchHistory")) {
+                        continue;
+                    }
+                    vipIdsBeans.add(gson.fromJson(item.toString(), TencentCoverInfoVO.EpisodeVO.class));
                 }
             }
-            root.remove("vip_ids");
 
             try {
-                List<String> subTypes = objToArray(root, "subtype");
-                List<String> directors = objToArray(root, "director");
-                List<String> leadingActor = objToArray(root, "leading_actor");
-                List<String> subGenre = objToArray(root, "sub_genre");
 
-
-                TencentCoverInfo coverInfoBean = gson.fromJson(root.toString(), TencentCoverInfo.class);
-                coverInfoBean.setVip_ids(vipIdsBeans);
-                coverInfoBean.setSubtype(subTypes);
-                coverInfoBean.setDirector(directors);
-                coverInfoBean.setLeading_actor(leadingActor);
-                coverInfoBean.setSub_genre(subGenre);
+                coverInfoBean.setEpisodes(vipIdsBeans);
                 return coverInfoBean;
             } catch (Exception e) {
                 e.printStackTrace();
             }
-//            System.out.println(coverInfoBean);
         }
         return null;
     }
 
 
-    public static List<TencentCoverInfo> getVideoList(String url) throws Exception {
-        List<TencentCoverInfo> coverInfoList = new ArrayList<>();
-        List<TencentWebUtil.TencentWebVideoInfo> list = TencentWebUtil.getVideoList(url);
+    public static List<TencentCoverInfoVO> getVideoList(int channelId, int pageIndex, Integer areaId) throws Exception {
+        List<TencentCoverInfoVO> coverInfoList = new ArrayList<>();
+        List<TencentWebUtil.TencentWebVideoInfo> list = TencentWebUtil.getVideoList(channelId, pageIndex,areaId);
+        loggerDebug.info("鑵捐瑙嗛锛氳幏鍙栬棰戝垪琛�-{}:{}",channelId,list.size());
         for (TencentWebUtil.TencentWebVideoInfo info : list) {
             try {
-                TencentCoverInfo coverInfo = getCoverInfo(info.getPlayUrl());
-                coverInfo.setVertical_pic_url(info.getPicture());
+                TencentCoverInfoVO coverInfo = getCoverInfo(info.getPlayUrl());
+                if(StringUtil.isNullOrEmpty(coverInfo.getPublish_date())){
+                    coverInfo.setPublish_date(info.getEpsodePubtime().substring(0,10));
+                }
+                coverInfo.setEpisode_update(info.getTag());
+                coverInfo.setNew_pic_vt(info.getPicture());
                 coverInfoList.add(coverInfo);
             } catch (Exception e) {
                 e.printStackTrace();
+                loggerDebug.error("鑵捐瑙嗛锛氳幏鍙栬鎯呭嚭閿�-"+info.getTitle(),e);
+            }finally {
+                Thread.sleep(2000);
             }
         }
         return coverInfoList;
     }
 
-    public static List<TencentCoverInfo> getVideoListByCategory(String channel, int page) {
-        String url = String.format("https://v.qq.com/x/bu/pagesheet/list?_all=1&append=1&channel=%s&ipay=2&listpage=%s&offset=%s&pagesize=30&sort=18", channel, page, (page - 1) * 30);
+    private static List toList(Object data){
+        if(data instanceof  Map){
+            List list=new ArrayList();
+            Map map=(Map) data;
+            for(Object key:map.keySet()){
+                list.add(map.get(key));
+            }
+            return list;
+        }else if(data instanceof  List){
+            return (List) data;
+        }
+        return null;
+    }
+
+    public static List<TencentCoverInfoVO> getVideoListByCategory(String channel, int page,Integer areaId) {
         try {
-            return getVideoList(url);
+            return getVideoList(TencentWebUtil.CHANNEL_ID_MAP.get(channel), page - 1, areaId);
         } catch (Exception e) {
             e.printStackTrace();
         }
@@ -164,29 +259,14 @@
 
 
     public static void main(String[] args) throws Exception {
-        List<TencentCoverInfo> list1 = getVideoList(TencentWebUtil.getApiUrl("https://v.qq.com/channel/tv?_all=1&channel=tv&iarea=818&listpage=1&sort=18", 1));
-        System.out.println(list1);
-
-        if (1 > 0)
-            return;
-
-//        try {
-        TencentCoverInfo info = getCoverInfo("https://v.qq.com/x/cover/7q544xyrava3vxf.html");
-        System.out.println(info);
-//
-//            info = getCoverInfo("https://v.qq.com/x/cover/mzc00200s0ii272/r0036xibgw2.html");
-//            info = getCoverInfo("https://v.qq.com/x/cover/ylgl3m6wo0sypou/w0036x9c5c7.html");
-//            info = getCoverInfo("https://v.qq.com/x/cover/ylgl3m6wo0sypou/w0036x9c5c7.html");
-//            //
-//            info = getCoverInfo("https://v.qq.com/x/cover/mzc00200tlv15ub.html");
-//            System.out.println(info);
-//        } catch (Exception e) {
-//            e.printStackTrace();
-
+//        getPageData("mzc002007j7p5hn", "chapter_name=&cid=mzc002007j7p5hn&detail_page_type=1&episode_begin=1&episode_end=30&episode_step=30&filter_rule_id=&id_type=1&is_nocopyright=false&is_skp_style=false&lid=&list_page_context=&need_tab=1&order=&page_num=0&page_size=30&req_from=web_vsite&req_from_second_type=&req_type=0&siteName=&tab_type=1&title_style=&ui_type=null&un_strategy_id=13dc6f30819942eb805250fb671fb082&watch_together_pay_status=0&year=");
+        List<TencentCoverInfoVO>  voList =  getVideoListByCategory("鐢佃鍓�",1, null);
+//        for(TencentCoverInfoVO vo:voList){
+//            System.out.println(vo.getTitle());
 //        }
-        int page = 1;
-        List<TencentCoverInfo> list = getVideoListByCategory("cartoon", 1);
-        System.out.println(list);
+//        System.out.printf(voList.toString());
+//        getCoverInfo("https://v.qq.com/x/cover/mzc002007j7p5hn/z0047b7g57k.html");
+        getCoverInfo("https://v.qq.com/x/cover/mcv8hkc8zk8lnov/r0048jp1a5e.html");
     }
 
 

--
Gitblit v1.8.0