From ab35ac8b769b2d9816dffb33a64f2c6f7bd5dd6e Mon Sep 17 00:00:00 2001 From: admin <weikou2014> Date: 星期四, 05 九月 2024 17:05:55 +0800 Subject: [PATCH] 风行网页版爬虫 --- src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java | 202 +++++++++++++++++++++++++++++++++++--------------- 1 files changed, 141 insertions(+), 61 deletions(-) diff --git a/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java b/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java index 71b6ddd..6637d93 100644 --- a/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java +++ b/src/main/java/com/yeshi/buwan/videos/tencent/TencentVideoApiUtil.java @@ -2,25 +2,28 @@ import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; -import com.yeshi.buwan.videos.tencent.entity.TencentCoverInfo; +import com.yeshi.buwan.util.StringUtil; import com.yeshi.buwan.util.video.web.TencentWebUtil; +import com.yeshi.buwan.videos.tencent.vo.TencentCoverInfoVO; +import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.script.Invocable; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; import javax.script.ScriptException; import java.lang.reflect.Type; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import java.util.*; public class TencentVideoApiUtil { static ScriptEngine jsEngine = null; + static Logger loggerDebug = LoggerFactory.getLogger("debug"); // static { // if (jdGoodsJs == null) @@ -68,12 +71,67 @@ return directors; } - public static TencentCoverInfo getCoverInfo(String url) throws Exception { + private static List<TencentCoverInfoVO.EpisodeVO> getPageData(String cid, String page_context) { + List<TencentCoverInfoVO.EpisodeVO> voList = new ArrayList<>(); + Map<String, String> headers = new HashMap<>(); + headers.put("Content-Type", "application/json; charset=utf-8"); + headers.put("Referer", "https://v.qq.com/"); + String text = + "{\"page_params\":{\"req_from\":\"web_vsite\",\"page_id\":\"vsite_episode_list\",\"page_type\":\"detail_operation\",\"id_type\":\"1\",\"page_size\":\"\",\"cid\":\"mzc002007j7p5hn\",\"vid\":\"\",\"lid\":\"\",\"page_num\":\"\",\"page_context\":\"chapter_name=&cid=mzc002007j7p5hn&detail_page_type=1&episode_begin=31&episode_end=60&episode_step=30&filter_rule_id=&id_type=1&is_nocopyright=false&is_skp_style=false&lid=&list_page_context=&need_tab=1&order=&page_num=1&page_size=30&req_from=web_vsite&req_from_second_type=&req_type=0&siteName=&tab_type=1&title_style=&ui_type=null&un_strategy_id=13dc6f30819942eb805250fb671fb082&watch_together_pay_status=0&year=\",\"detail_page_type\":\"1\"},\"has_cache\":1}"; + JSONObject params = JSONObject.fromObject(text); + params.optJSONObject("page_params").put("cid", cid + ""); + params.optJSONObject("page_params").put("page_context", page_context + ""); + String result = com.yeshi.buwan.util.HttpUtil.post("https://pbaccess.video.qq.com/trpc.universal_backend_service.page_server_rpc.PageServer/GetPageData?video_appid=3000010&vplatform=2&vversion_name=8.2.96", params.toString(), headers); + System.out.println(result); + JSONObject root = JSONObject.fromObject(result); + JSONArray items = root.optJSONObject("data").optJSONArray("module_list_datas").optJSONObject(0).optJSONArray("module_datas").optJSONObject(0).optJSONObject("item_data_lists").optJSONArray("item_datas"); + for (int i = 0; i < items.size(); i++) { + JSONObject item = items.optJSONObject(i); + if(item.optInt("item_type")!=1){ + continue; + } + + JSONObject item_params = item.optJSONObject("item_params"); + TencentCoverInfoVO.EpisodeVO vo = new TencentCoverInfoVO.EpisodeVO(); + vo.setCid(item_params.optString("cid")); + vo.setDuration(item_params.optInt("duration")); + vo.setEpIndex(item_params.optInt("title") - 1); + vo.setFullTitle(item_params.optString("union_title")); + vo.setIndex(item_params.optInt("title") - 1); + vo.setIsNoStoreWatchHistory(item_params.optInt("is_no_store_watch_history") > 0); + vo.setItemType(item.optString("item_type")); + vo.setPic(item_params.optString("image_url")); + vo.setPlayTitle(item_params.optString("play_title")); + vo.setTitle(item_params.optString("title")); + vo.setVid(item_params.optString("vid")); + if(!vo.isIsNoStoreWatchHistory()){ + voList.add(vo); + } + + } + return voList; + + } + + /** + * @author hxh + * @description 鏄惁鍦ㄧ嚎 + * @date 17:39 2024/8/16 + * @param: url + * @return boolean + **/ + public static boolean isOnLine(String url) throws Exception{ + Document doc = Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36").get(); + return doc.getElementsByClass("page_404").size()==0; + + } + + public static TencentCoverInfoVO getCoverInfo(String url) throws Exception { String script = null; Document doc = Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36").get(); Elements els = doc.getElementsByTag("script"); for (int i = 0; i < els.size(); i++) { - if (els.get(i).html().indexOf("var COVER_INFO") > -1) { + if (els.get(i).html().indexOf("window.__PINIA__") > -1) { script = els.get(i).html(); break; } @@ -81,7 +139,7 @@ script = script.replace("window.", "_window."); script = " var _window={};" + "var document={getElementsByTagName:function(tag){return [''];}};" + script; - script += "\n function getCoverInfo(){return COVER_INFO }; function getVideoInfo(){return VIDEO_INFO}"; + script += "\n function getCoverInfo(){return _window.__PINIA__.global.coverInfo;}; function getVideoInfo(){return _window.__PINIA__.episodeMain.listData;}";//episodeMain.listData[0].list[0]; ScriptEngineManager manager = new ScriptEngineManager(); jsEngine = manager.getEngineByName("javascript"); @@ -97,65 +155,102 @@ Object coverInfo = in.invokeFunction("getCoverInfo"); JSONObject root = JSONObject.fromObject(gson.toJson(coverInfo)); System.out.println(root.toString()); - List<TencentCoverInfo.VipIdsBean> vipIdsBeans = new ArrayList<>(); + List<TencentCoverInfoVO.EpisodeVO> vipIdsBeans = new ArrayList<>(); - if (root.optJSONObject("vip_ids").isArray()) { - Type type = new TypeToken<List<TencentCoverInfo.VipIdsBean>>() { - }.getType(); - vipIdsBeans = gson.fromJson(root.optJSONArray("vip_ids").toString(), type); + Object videoInfo = in.invokeFunction("getVideoInfo"); + Object tabs = null; + if (videoInfo instanceof Map) { + Map videoInfoMap = (Map) videoInfo; + String key = videoInfoMap.keySet().iterator().next().toString(); + videoInfoMap = (Map) videoInfoMap.get(key); + tabs = videoInfoMap.get("tabs"); + videoInfoMap = (Map) videoInfoMap.get("list"); + + + key = videoInfoMap.keySet().iterator().next().toString(); + videoInfo = videoInfoMap.get(key); + } else if (videoInfo instanceof List) { + List tempList = (List) videoInfo; + tabs = ((Map) tempList.get(0)).get("tabs"); + tempList = (List) (((Map) tempList.get(0)).get("list")); + videoInfo = tempList.get(0); + } + tabs = toList(tabs); + TencentCoverInfoVO coverInfoBean = gson.fromJson(root.toString(), TencentCoverInfoVO.class); + if (tabs != null&&((List) tabs).size()>0) { + // 闇�瑕佽幏鍙栭泦 + JSONArray tabArrays = JSONArray.fromObject(gson.toJson(tabs)); + for (int i = 0; i < tabArrays.size(); i++) { + String pageContext = tabArrays.optJSONObject(i).optString("pageContext"); + List<TencentCoverInfoVO.EpisodeVO> epList = getPageData(coverInfoBean.getCover_id(), pageContext); + if (epList != null) { + vipIdsBeans.addAll(epList); + } + } } else { - JSONObject vipIds = root.optJSONObject("vip_ids"); - for (Iterator<String> its = vipIds.keys(); its.hasNext(); ) { - String p = its.next(); - TencentCoverInfo.VipIdsBean idsBean = gson.fromJson(vipIds.optJSONObject(p).toString(), TencentCoverInfo.VipIdsBean.class); - idsBean.setP(Integer.parseInt(p)); - vipIdsBeans.add(idsBean); + System.out.printf(gson.toJson(videoInfo)); + JSONObject episodeObj = JSONObject.fromObject(gson.toJson(videoInfo)); + for (Object k : episodeObj.keySet()) { + JSONObject item = episodeObj.optJSONObject(k.toString()); + if (item.optBoolean("isNoStoreWatchHistory")) { + continue; + } + vipIdsBeans.add(gson.fromJson(item.toString(), TencentCoverInfoVO.EpisodeVO.class)); } } - root.remove("vip_ids"); try { - List<String> subTypes = objToArray(root, "subtype"); - List<String> directors = objToArray(root, "director"); - List<String> leadingActor = objToArray(root, "leading_actor"); - List<String> subGenre = objToArray(root, "sub_genre"); - - TencentCoverInfo coverInfoBean = gson.fromJson(root.toString(), TencentCoverInfo.class); - coverInfoBean.setVip_ids(vipIdsBeans); - coverInfoBean.setSubtype(subTypes); - coverInfoBean.setDirector(directors); - coverInfoBean.setLeading_actor(leadingActor); - coverInfoBean.setSub_genre(subGenre); + coverInfoBean.setEpisodes(vipIdsBeans); return coverInfoBean; } catch (Exception e) { e.printStackTrace(); } -// System.out.println(coverInfoBean); } return null; } - public static List<TencentCoverInfo> getVideoList(String url) throws Exception { - List<TencentCoverInfo> coverInfoList = new ArrayList<>(); - List<TencentWebUtil.TencentWebVideoInfo> list = TencentWebUtil.getVideoList(url); + public static List<TencentCoverInfoVO> getVideoList(int channelId, int pageIndex, Integer areaId) throws Exception { + List<TencentCoverInfoVO> coverInfoList = new ArrayList<>(); + List<TencentWebUtil.TencentWebVideoInfo> list = TencentWebUtil.getVideoList(channelId, pageIndex,areaId); + loggerDebug.info("鑵捐瑙嗛锛氳幏鍙栬棰戝垪琛�-{}:{}",channelId,list.size()); for (TencentWebUtil.TencentWebVideoInfo info : list) { try { - TencentCoverInfo coverInfo = getCoverInfo(info.getPlayUrl()); - coverInfo.setVertical_pic_url(info.getPicture()); + TencentCoverInfoVO coverInfo = getCoverInfo(info.getPlayUrl()); + if(StringUtil.isNullOrEmpty(coverInfo.getPublish_date())){ + coverInfo.setPublish_date(info.getEpsodePubtime().substring(0,10)); + } + coverInfo.setEpisode_update(info.getTag()); + coverInfo.setNew_pic_vt(info.getPicture()); coverInfoList.add(coverInfo); } catch (Exception e) { e.printStackTrace(); + loggerDebug.error("鑵捐瑙嗛锛氳幏鍙栬鎯呭嚭閿�-"+info.getTitle(),e); + }finally { + Thread.sleep(2000); } } return coverInfoList; } - public static List<TencentCoverInfo> getVideoListByCategory(String channel, int page) { - String url = String.format("https://v.qq.com/x/bu/pagesheet/list?_all=1&append=1&channel=%s&ipay=2&listpage=%s&offset=%s&pagesize=30&sort=18", channel, page, (page - 1) * 30); + private static List toList(Object data){ + if(data instanceof Map){ + List list=new ArrayList(); + Map map=(Map) data; + for(Object key:map.keySet()){ + list.add(map.get(key)); + } + return list; + }else if(data instanceof List){ + return (List) data; + } + return null; + } + + public static List<TencentCoverInfoVO> getVideoListByCategory(String channel, int page,Integer areaId) { try { - return getVideoList(url); + return getVideoList(TencentWebUtil.CHANNEL_ID_MAP.get(channel), page - 1, areaId); } catch (Exception e) { e.printStackTrace(); } @@ -164,29 +259,14 @@ public static void main(String[] args) throws Exception { - List<TencentCoverInfo> list1 = getVideoList(TencentWebUtil.getApiUrl("https://v.qq.com/channel/tv?_all=1&channel=tv&iarea=818&listpage=1&sort=18", 1)); - System.out.println(list1); - - if (1 > 0) - return; - -// try { - TencentCoverInfo info = getCoverInfo("https://v.qq.com/x/cover/7q544xyrava3vxf.html"); - System.out.println(info); -// -// info = getCoverInfo("https://v.qq.com/x/cover/mzc00200s0ii272/r0036xibgw2.html"); -// info = getCoverInfo("https://v.qq.com/x/cover/ylgl3m6wo0sypou/w0036x9c5c7.html"); -// info = getCoverInfo("https://v.qq.com/x/cover/ylgl3m6wo0sypou/w0036x9c5c7.html"); -// // -// info = getCoverInfo("https://v.qq.com/x/cover/mzc00200tlv15ub.html"); -// System.out.println(info); -// } catch (Exception e) { -// e.printStackTrace(); - +// getPageData("mzc002007j7p5hn", "chapter_name=&cid=mzc002007j7p5hn&detail_page_type=1&episode_begin=1&episode_end=30&episode_step=30&filter_rule_id=&id_type=1&is_nocopyright=false&is_skp_style=false&lid=&list_page_context=&need_tab=1&order=&page_num=0&page_size=30&req_from=web_vsite&req_from_second_type=&req_type=0&siteName=&tab_type=1&title_style=&ui_type=null&un_strategy_id=13dc6f30819942eb805250fb671fb082&watch_together_pay_status=0&year="); + List<TencentCoverInfoVO> voList = getVideoListByCategory("鐢佃鍓�",1, null); +// for(TencentCoverInfoVO vo:voList){ +// System.out.println(vo.getTitle()); // } - int page = 1; - List<TencentCoverInfo> list = getVideoListByCategory("cartoon", 1); - System.out.println(list); +// System.out.printf(voList.toString()); +// getCoverInfo("https://v.qq.com/x/cover/mzc002007j7p5hn/z0047b7g57k.html"); + getCoverInfo("https://v.qq.com/x/cover/mcv8hkc8zk8lnov/r0048jp1a5e.html"); } -- Gitblit v1.8.0