From ab35ac8b769b2d9816dffb33a64f2c6f7bd5dd6e Mon Sep 17 00:00:00 2001 From: admin <weikou2014> Date: 星期四, 05 九月 2024 17:05:55 +0800 Subject: [PATCH] 风行网页版爬虫 --- src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java | 259 ++++++++++++++++++++++++++------------------------- 1 files changed, 133 insertions(+), 126 deletions(-) diff --git a/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java b/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java index 8b10ee6..d24b862 100644 --- a/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java +++ b/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java @@ -1,5 +1,6 @@ package com.yeshi.buwan.videos.hanmi; +import com.yeshi.buwan.util.StringUtil; import com.yeshi.buwan.videos.hanmi.entity.HanmiShow; import com.yeshi.buwan.videos.hanmi.entity.HanmiShowEpisode; import org.jsoup.Connection; @@ -9,6 +10,9 @@ import org.jsoup.select.Elements; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URLEncoder; import java.util.*; public class HanmiApiUtil { @@ -42,45 +46,67 @@ } public static HanmiShow parseShowDetail(HanmiShow show) throws Exception { - if (show.getUrl() == null || !show.getUrl().startsWith("https://www.hmtv.me/show/")) { + if (show.getUrl() == null || !show.getUrl().startsWith("https://www.wztaichuan.com/vod/detail")) { throw new Exception("閾炬帴涓嶅悎娉�"); } + + URI uri = URI.create(show.getUrl()); + Document doc = getDoc(show.getUrl(), getHeaders()); - Element root = doc.getElementsByClass("video-content").get(0); - Element titleItem = root.getElementsByClass("article-title").get(0); + Element root = doc.getElementsByClass("stui-content__thumb").get(0).parent(); + + String picture = doc.getElementsByClass("stui-content__thumb").get(0).getElementsByTag("img").get(0).attr("data-original"); + + //鑺傜洰淇℃伅 + Element videoInfo = root.getElementsByClass("stui-content__detail").get(0); + + Element titleItem = videoInfo.getElementsByClass("title").get(0); //鏍囬 String title = null; try { - title = titleItem.getElementsByClass("item-title").get(0).ownText(); + title = titleItem.ownText(); } catch (IndexOutOfBoundsException e) { } - String year = null; + String score = null; try { - year = titleItem.getElementsByClass("item-year").get(0).ownText(); + score = titleItem.getElementsByClass("score").get(0).ownText(); } catch (IndexOutOfBoundsException e) { } + show.setScore(score); - //鑺傜洰淇℃伅 - Element videoBox = root.getElementsByClass("video_box").get(0); - - String picture = videoBox.getElementsByClass("video_img").get(0).getElementsByTag("img").attr("src"); - - Element videoInfo = videoBox.getElementsByClass("video_info").get(0); - String videoInfoStr = videoInfo.html(); - String[] sts = videoInfoStr.split("<br>"); Map<String, String> infos = new HashMap<>(); - for (String st : sts) { - Document d = Jsoup.parse(st); - String value = d.text(); - if (value.indexOf(":") > -1) - infos.put(value.substring(0, value.indexOf(":")).trim(), value.substring(value.indexOf(":") + 1).trim()); + Elements datas = videoInfo.getElementsByClass("data"); + for(int i=0;i<datas.size();i++){ + Elements data_items = datas.get(i).getElementsByClass("text-muted"); + for(int j=0; j<data_items.size(); j++){ + String key = data_items.get(j).ownText().trim(); + String value = null; + switch (key){ + case "涓绘紨锛�": + List<String> actors=new ArrayList<>(); + Elements temps = data_items.get(j).parent().getElementsByTag("a"); + for(Iterator<Element> its = temps.iterator(); its.hasNext();) + { + actors.add( its.next().ownText()); + } + value = StringUtil.join(actors, ","); + break; + default: + if(data_items.get(j).nextElementSibling()!=null) { + value = data_items.get(j).nextElementSibling().ownText(); + } else{ + value = data_items.get(j).parent().ownText(); + } + } + infos.put(key,value); + } } - + String desc = videoInfo.getElementsByClass("desc").get(0).ownText(); //鍓ч泦鍒楄〃 - Element eposide = root.getElementsByClass("video_list_li").get(0); + Element eposide = doc.getElementsByClass("playlist").get(0).getElementsByClass("stui-content__playlist").get(0); Elements eposides = eposide.getElementsByTag("a"); List<HanmiShowEpisode> episodeList = new ArrayList<>(); @@ -88,17 +114,10 @@ //鐢靛奖 if (show.getType() != null && show.getType().contains("褰�")) { int index = 0; -// for (int i = 0; i < eposides.size(); i++) { -// String tag = eposides.get(i).ownText(); -// if (tag.contains("HD")) { -// index = i; -// break; -// } -// } String href = eposides.get(index).attr("href"); HanmiShowEpisode ep = new HanmiShowEpisode(); ep.setOrderBy(0); - ep.setPlayUrl("https://www.hmtv.me" + href); + ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href)); ep.setTag(show.getTitle() != null ? show.getTitle() : title); episodeList.add(ep); } else { @@ -107,75 +126,35 @@ String tag = eposides.get(i).ownText(); HanmiShowEpisode ep = new HanmiShowEpisode(); ep.setOrderBy(i + 1); - ep.setPlayUrl("https://www.hmtv.me" + href); + ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href)); ep.setTag(tag); episodeList.add(ep); } } - //绠�浠� - String desc = root.getElementsByClass("jianjie").get(0).text(); - - if (show.getTitle() == null) show.setTitle(title.split(" ")[0]); show.setPicture(picture); - if (infos.get("涓绘紨") != null) - show.setActors(infos.get("涓绘紨"). - + if (infos.get("涓绘紨锛�") != null) + show.setActors(infos.get("涓绘紨锛�"). replace("/", ",")); - if (infos.get("瀵兼紨") != null) - show.setDirector(infos.get("瀵兼紨")); - if (infos.get("绫诲瀷") != null) - show.setCategorys(infos.get("绫诲瀷")); - if (infos.get("鍥藉/鍦板尯") != null) - show.setArea(infos.get("鍥藉/鍦板尯")); - if (infos.get("棣栨挱") != null) - show.setRelaseDate(infos.get("棣栨挱"). - - substring(0, infos.get("棣栨挱"). - - indexOf("(") > -1 ? infos.get("棣栨挱"). - - indexOf("(") : infos.get("棣栨挱"). - - length())); - if (infos.get("涓婃槧鏃ユ湡") != null) { - show.setRelaseDate(infos.get("涓婃槧鏃ユ湡"). - - substring(0, infos.get("涓婃槧鏃ユ湡"). - - indexOf("(") > -1 ? infos.get("涓婃槧鏃ユ湡"). - - indexOf("(") : infos.get("涓婃槧鏃ユ湡"). - - length())); - } - - - if (year == null && show.getRelaseDate() != null) { - year = show.getRelaseDate().split("-")[0]; - } - - show.setYear(year.replace("(", ""). - replace(")", "")); - + if (infos.get("瀵兼紨锛�") != null) + show.setDirector(infos.get("瀵兼紨锛�")); + if (infos.get("绫诲瀷锛�") != null) + show.setCategorys(infos.get("绫诲瀷锛�")); + if (infos.get("鍦板尯锛�") != null) + show.setArea(infos.get("鍦板尯锛�")); + show.setDesc(desc); + show.setYear(infos.get("骞翠唤锛�")); if (show.getYear() != null && show.getRelaseDate() == null) { show.setRelaseDate(show.getYear() + "-01-01"); } - - show.setId(show.getUrl(). - - replace("https://www.hmtv.me/show/", ""). - - trim()); + show.setId(show.getUrl().split("/id/")[1].split("/")[0].split("\\.")[0].trim()); show.setEpisodeList(episodeList); show.setUrl(show.getUrl()); - show.setDesc(desc.trim()); return show; } - public static List<HanmiShow> parseList(String listUrl) throws IOException { Map<String, String> headers = new HashMap<>(); @@ -188,28 +167,29 @@ List<HanmiShow> list = new ArrayList<>(); Document doc = getDoc(listUrl, headers); - String type = doc.getElementsByClass("list-content").get(0).getElementsByClass("title").get(0).getElementsByTag("strong").text(); - - Element root = doc.getElementsByClass("m-movies").get(0); - Elements items = root.getElementsByClass("u-movie"); + Elements es = doc.getElementsByClass("stui-pannel_bd"); + Element root = null; + for(int i=0;i<es.size();i++){ + if( es.get(i).select("ul.stui-vodlist").size()>0){ + root = es.get(i).select("ul.stui-vodlist").get(0); + break; + } + } + Elements items = root.getElementsByTag("li"); for (int i = 0; i < items.size(); i++) { Element item = items.get(i); HanmiShow show = new HanmiShow(); - show.setUrl(item.getElementsByTag("a").get(0).attr("href")); - show.setTag(item.getElementsByClass("zhuangtai").get(0).text()); - String score = item.getElementsByClass("pingfen").get(0).text(); - if (score != null) { - score = score.replace("鍒�", ""); - show.setScore(score); + String url = item.getElementsByTag("a").get(0).attr("href"); + if(!url.startsWith("http")){ + URI uri = URI.create(listUrl); + url=String.format("%s://%s%s",uri.getScheme(),uri.getHost(),url); } - show.setTitle(item.getElementsByTag("h2").get(0).getElementsByTag("a").get(0).ownText()); - show.setType(type); - show.setId(show.getUrl(). - replace("https://www.hmtv.me/show/", ""). - trim()); + show.setUrl(url); + show.setTag(item.getElementsByClass("pic-text").get(0).text()); + show.setTitle(item.getElementsByClass("stui-vodlist__detail").get(0).getElementsByTag("a").get(0).ownText()); + show.setId(show.getUrl().split("/")[show.getUrl().split("/").length-1].split("\\.")[0].trim()); list.add(show); } - return list; } @@ -221,31 +201,26 @@ * @return */ public static List<HanmiShowEpisode> getShowEpisodesFromPlayUrl(String playUrl) throws IOException { - List<HanmiShowEpisode> episodeList = new ArrayList<>(); - Document doc = getDoc(playUrl, null); - Element els = doc.getElementById("playnav"); - Elements items = els.getElementsByTag("li"); - int playIndex = -1; - for (int i = 0; i < items.size(); i++) { - String name = items.get(i).text(); - if (name.contains("HM")) { - playIndex = i; - break; - } - } - if (playIndex < 0) - return null; - Element tab = doc.getElementById("playcontainer").getElementsByClass("tab").get(playIndex); - Elements es = tab.getElementsByTag("a"); + URI uri = URI.create(playUrl); - for (int i = 0; i < es.size(); i++) { - HanmiShowEpisode episode = new HanmiShowEpisode(); - String href = "https://www.hmtv.me" + es.get(i).attr("href"); - String name = es.get(i).text(); - episode.setTag(name); - episode.setPlayUrl(href); - episode.setOrderBy(i + 1); - episodeList.add(episode); + Document doc = getDoc(playUrl, getHeaders()); + + Element root = doc.getElementsByClass("stui-content__thumb").get(0).parent(); + + String picture = doc.getElementsByClass("stui-content__thumb").get(0).getElementsByTag("img").get(0).attr("data-original"); + + //鍓ч泦鍒楄〃 + Element eposide = doc.getElementsByClass("playlist").get(0).getElementsByClass("stui-content__playlist").get(0); + Elements eposides = eposide.getElementsByTag("a"); + List<HanmiShowEpisode> episodeList = new ArrayList<>(); + for (int i = 0; i < eposides.size(); i++) { + String href = eposides.get(i).attr("href"); + String tag = eposides.get(i).ownText(); + HanmiShowEpisode ep = new HanmiShowEpisode(); + ep.setOrderBy(i + 1); + ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href)); + ep.setTag(tag); + episodeList.add(ep); } return episodeList; } @@ -255,9 +230,6 @@ List<HanmiShow> list = new ArrayList<>(); for (HanmiShow show : showList) { try { - if (!show.getUrl().startsWith("http")) { - show.setUrl("https://www.hmtv.me" + show.getUrl()); - } list.add(parseShowDetail(show)); } catch (Exception e) { e.printStackTrace(); @@ -266,9 +238,44 @@ return list; } + + public static void parseDY() throws UnsupportedEncodingException { + for(int p=1;p<20;p++) { + String url = String.format("https://www.wztaichuan.com/vod/show/area/%s/id/1/page/%d.html", URLEncoder.encode("闊╁浗","UTF-8"), p); + try { + List<HanmiShow> showList = parseList(url); + showList = parseDetailList(showList); + for(HanmiShow show:showList){ + show.setTag("璇勫垎锛�"+show.getScore()); + show.setCategorys("鐢靛奖"); + } + System.out.println("鐢靛奖:"+p +"-"+showList.size()); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + + + + public static void main(String[] args) throws Exception { - List<HanmiShowEpisode> list = getShowEpisodesFromPlayUrl("https://www.hmtv.me/vplay/MTExNS0xLTA=.html"); - System.out.println(list); +// List<HanmiShowEpisode> list = getShowEpisodesFromPlayUrl("https://www.hmtv.me/vplay/MTExNS0xLTA=.html"); +// System.out.println(list); +// List<HanmiShow> showList = HanmiApiUtil.parseDetailList(HanmiApiUtil.parseList("https://www.hanjutv.me/hanju/page/" + 50)); +// System.out.printf(showList.toString()); + +// URI uri = URI.create("https://www.hanjutv.me/s/1579"); +// System.out.println(uri.getScheme()); +// parseList("https://www.wztaichuan.com/vod/type/id/5/page/1.html"); + parseDY(); + +// +// System.out.println(uri.getHost()); +// HanmiShow show=new HanmiShow(); +// show.setUrl("https://www.wztaichuan.com/vod/detail/id/10195.html"); +// show = parseShowDetail(show); +// System.out.println(show); } } -- Gitblit v1.8.0