From ab35ac8b769b2d9816dffb33a64f2c6f7bd5dd6e Mon Sep 17 00:00:00 2001 From: admin <weikou2014> Date: 星期四, 05 九月 2024 17:05:55 +0800 Subject: [PATCH] 风行网页版爬虫 --- src/main/java/com/yeshi/buwan/util/video/web/TencentWebUtil.java | 170 +++++++++++++++++++++++++++++++++----------------------- 1 files changed, 100 insertions(+), 70 deletions(-) diff --git a/src/main/java/com/yeshi/buwan/util/video/web/TencentWebUtil.java b/src/main/java/com/yeshi/buwan/util/video/web/TencentWebUtil.java index 1f7f87f..865ae67 100644 --- a/src/main/java/com/yeshi/buwan/util/video/web/TencentWebUtil.java +++ b/src/main/java/com/yeshi/buwan/util/video/web/TencentWebUtil.java @@ -1,10 +1,6 @@ package com.yeshi.buwan.util.video.web; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.yeshi.utils.HttpUtil; +import net.sf.json.JSONObject; import org.yeshi.utils.StringUtil; import java.io.UnsupportedEncodingException; @@ -13,12 +9,29 @@ public class TencentWebUtil { + final public static Map<String,Integer> CHANNEL_ID_MAP=new HashMap<>(); + static{ + CHANNEL_ID_MAP.put("鐢靛奖",100173); + CHANNEL_ID_MAP.put("鐢佃鍓�",100113); + CHANNEL_ID_MAP.put("鍔ㄦ极",100119); + } + public static class TencentWebVideoInfo { private String playUrl; private String id; private String title; private String picture; private String duration; + private String tag; + private String epsodePubtime; + + public String getEpsodePubtime() { + return epsodePubtime; + } + + public void setEpsodePubtime(String epsodePubtime) { + this.epsodePubtime = epsodePubtime; + } public String getPlayUrl() { return playUrl; @@ -59,6 +72,14 @@ public void setDuration(String duration) { this.duration = duration; } + + public String getTag() { + return tag; + } + + public void setTag(String tag) { + this.tag = tag; + } } @@ -83,75 +104,57 @@ /** * 鑾峰彇鐭棰戝垪琛� * - * @param params - * @param page + * @param channelId 100173:鐢靛奖 100113:鐢佃鍓� 100119:鍔ㄦ极 + * @param pageIndex * @return * @throws Exception */ - public static List<TencentWebVideoInfo> getVideoList(Map<String, String> params, int page) throws Exception { - if (params == null) - throw new Exception("鍙傛暟涓虹┖"); - int pageSize = 30; - params.put("append", "1"); - params.put("listpage", page + ""); - params.put("offset", (page - 1) * pageSize + ""); - params.put("pagesize", pageSize + ""); - - String url = "https://v.qq.com/x/bu/pagesheet/list"; + public static List<TencentWebVideoInfo> getVideoList(int channelId, int pageIndex, Integer areaId) throws Exception { + List<TencentWebVideoInfo> videoList=new ArrayList<>(); Map<String, String> headers = new HashMap<>(); - headers.put("referer", "https://v.qq.com/channel/ent"); - headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"); - String result = HttpUtil.get(url, params, headers); - Document document = Jsoup.parse(result); - Elements els = document.getElementsByClass("list_item"); - return parseVideoList(els); - } - - - public static List<TencentWebVideoInfo> getVideoList(String url) throws Exception { - Map<String, String> headers = new HashMap<>(); - headers.put("referer", "https://v.qq.com/channel/ent"); - headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"); - String result = HttpUtil.get(url, new HashMap<>(), headers); - Document document = Jsoup.parse(result); - Elements els = document.getElementsByClass("list_item"); - return parseVideoList(els); - } - - private static List<TencentWebVideoInfo> parseVideoList(Elements els) throws UnsupportedEncodingException { - List<TencentWebVideoInfo> list = new ArrayList<>(); - for (int i = 0; i < els.size(); i++) { - Element ele = els.get(i); - String href = ele.getElementsByTag("a").get(0).attr("href"); - String id = ele.getElementsByTag("a").get(0).attr("data-float"); - String title = ele.getElementsByTag("a").get(0).attr("title"); - title = new String(title.getBytes("ISO-8859-1"), "UTF-8"); - String picture = ele.getElementsByTag("img").get(0).attr("src"); - picture = picture.startsWith("http") ? picture : "https:" + picture; - String duration = null; - try { - duration = ele.getElementsByClass("figure_caption").get(0).ownText(); - duration = duration.trim(); - for (int j = 0; j < duration.length(); j++) { - char ca = duration.charAt(j); - if (!(ca >= 48 && ca < 59)) { - duration = null; - break; - } - } - System.out.println(duration); - } catch (Exception e) { - } - TencentWebVideoInfo videoInfo = new TencentWebVideoInfo(); - videoInfo.setDuration(duration); - videoInfo.setId(id); - videoInfo.setPicture(picture); - videoInfo.setPlayUrl(href); - videoInfo.setTitle(title); - list.add(videoInfo); + headers.put("Content-Type", "application/json; charset=utf-8"); + headers.put("Referer", "https://v.qq.com/"); + String text = + "{\"page_context\":{\"page_index\":\"1\"},\"page_params\":{\"page_id\":\"channel_list_second_page\",\"page_type\":\"operation\",\"channel_id\":\"100173\",\"filter_params\":\"sort=75\",\"page\":\"1\",\"new_mark_label_enabled\":\"1\"},\"page_bypass_params\":{\"params\":{\"page_id\":\"channel_list_second_page\",\"page_type\":\"operation\",\"channel_id\":\"100173\",\"filter_params\":\"sort=75\",\"page\":\"1\",\"caller_id\":\"3000010\",\"platform_id\":\"2\",\"data_mode\":\"default\",\"user_mode\":\"default\"},\"scene\":\"operation\",\"abtest_bypass_id\":\"77fef11ab0ccd4ee\"}}"; + JSONObject params=JSONObject.fromObject(text); + params.optJSONObject("page_context").put("page_index",pageIndex+""); + params.optJSONObject("page_params").put("channel_id",channelId+""); + params.optJSONObject("page_params").put("page",pageIndex+""); + if(areaId!=null) { + params.optJSONObject("page_params").put("filter_params", params.optJSONObject("page_params").get("filter_params") + "&iarea=" + areaId); } - return list; + params.optJSONObject("page_bypass_params").optJSONObject("params").put("page",pageIndex+""); + params.optJSONObject("page_bypass_params").optJSONObject("params").put("channel_id",channelId+""); + + String result = com.yeshi.buwan.util.HttpUtil.post("https://pbaccess.video.qq.com/trpc.vector_layout.page_view.PageService/getPage?video_appid=3000010", params.toString(), headers); + + + com.alibaba.fastjson.JSONObject resultJson = com.alibaba.fastjson.JSONObject.parseObject(result); + if (resultJson.getInteger("ret") == 0) { + + com.alibaba.fastjson.JSONObject data = resultJson.getJSONObject("data"); + com.alibaba.fastjson.JSONArray array = data.getJSONArray("CardList"); + for (int i = 0; i < array.size(); i++) { + if (array.getJSONObject(i).getString("type").equalsIgnoreCase("channel_list_poster")) { + array = array.getJSONObject(i).getJSONObject("children_list").getJSONObject("list").getJSONArray("cards"); + for (int j = 0; j < array.size(); j++) { + if (array.getJSONObject(j).getString("type").equalsIgnoreCase("channel_list_poster")) { + com.alibaba.fastjson.JSONObject item = array.getJSONObject(j).getJSONObject("params"); + TencentWebVideoInfo video = parseListItem(item); + videoList.add(video); + } + } + break; + } + + } + + } + return videoList; } + + + public static String getApiUrl(String webUrl, int page) { Map<String, String> params = parseParams(webUrl); @@ -174,9 +177,36 @@ return url + "?" + StringUtil.concat(paramsList, "&"); } + private static TencentWebVideoInfo parseListItem(com.alibaba.fastjson.JSONObject item) { + TencentWebVideoInfo videoInfo = new TencentWebVideoInfo(); + videoInfo.setId(item.getString("cid")); + videoInfo.setPicture(item.getString("new_pic_vt")); + videoInfo.setPlayUrl(String.format("https://v.qq.com/x/cover/%s.html", videoInfo.getId())); + videoInfo.setTitle(item.getString("title")); + videoInfo.setDuration(""); + if (item.getInteger("type") != 1) { + videoInfo.setTag(item.getString("timelong")); + } else { + JSONObject imgTag = JSONObject.fromObject(item.getString("uni_imgtag")); + for (Object key : imgTag.keySet()) { + JSONObject imgTagItem = imgTag.optJSONObject(key.toString()); + if (imgTagItem.optInt("id") == 28) { + videoInfo.setTag(imgTagItem.optString("text")); + } + } + if(videoInfo.getTag()==null){ + videoInfo.setTag("9.0鍒�"); + } + } + videoInfo.setEpsodePubtime(item.getString("epsode_pubtime")); + return videoInfo; + } + public static void main(String[] args) throws Exception { - List<TencentWebVideoInfo> videoInfos = getVideoList(parseParams("https://v.qq.com/channel/ent?_all=1&channel=ent&iarea=2&itype=-1&listpage=1&sort=40"), 1); - System.out.println(videoInfos); + + List<TencentWebVideoInfo> list = getVideoList(100173,0, 100028); + System.out.println(list.size()); + } } -- Gitblit v1.8.0