From ab35ac8b769b2d9816dffb33a64f2c6f7bd5dd6e Mon Sep 17 00:00:00 2001
From: admin <weikou2014>
Date: 星期四, 05 九月 2024 17:05:55 +0800
Subject: [PATCH] 风行网页版爬虫

---
 src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java |  259 ++++++++++++++++++++++++++-------------------------
 1 files changed, 133 insertions(+), 126 deletions(-)

diff --git a/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java b/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java
index 8b10ee6..d24b862 100644
--- a/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java
+++ b/src/main/java/com/yeshi/buwan/videos/hanmi/HanmiApiUtil.java
@@ -1,5 +1,6 @@
 package com.yeshi.buwan.videos.hanmi;
 
+import com.yeshi.buwan.util.StringUtil;
 import com.yeshi.buwan.videos.hanmi.entity.HanmiShow;
 import com.yeshi.buwan.videos.hanmi.entity.HanmiShowEpisode;
 import org.jsoup.Connection;
@@ -9,6 +10,9 @@
 import org.jsoup.select.Elements;
 
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URLEncoder;
 import java.util.*;
 
 public class HanmiApiUtil {
@@ -42,45 +46,67 @@
     }
 
     public static HanmiShow parseShowDetail(HanmiShow show) throws Exception {
-        if (show.getUrl() == null || !show.getUrl().startsWith("https://www.hmtv.me/show/")) {
+        if (show.getUrl() == null || !show.getUrl().startsWith("https://www.wztaichuan.com/vod/detail")) {
             throw new Exception("閾炬帴涓嶅悎娉�");
         }
+
+        URI uri =  URI.create(show.getUrl());
+
         Document doc = getDoc(show.getUrl(), getHeaders());
 
-        Element root = doc.getElementsByClass("video-content").get(0);
-        Element titleItem = root.getElementsByClass("article-title").get(0);
+        Element root = doc.getElementsByClass("stui-content__thumb").get(0).parent();
+
+        String picture = doc.getElementsByClass("stui-content__thumb").get(0).getElementsByTag("img").get(0).attr("data-original");
+
+        //节目信息
+        Element videoInfo = root.getElementsByClass("stui-content__detail").get(0);
+
+        Element titleItem = videoInfo.getElementsByClass("title").get(0);
 
         //标题
         String title = null;
         try {
-            title = titleItem.getElementsByClass("item-title").get(0).ownText();
+            title = titleItem.ownText();
         } catch (IndexOutOfBoundsException e) {
         }
 
-        String year = null;
+        String score = null;
         try {
-            year = titleItem.getElementsByClass("item-year").get(0).ownText();
+            score = titleItem.getElementsByClass("score").get(0).ownText();
         } catch (IndexOutOfBoundsException e) {
         }
+        show.setScore(score);
 
-        //节目信息
-        Element videoBox = root.getElementsByClass("video_box").get(0);
-
-        String picture = videoBox.getElementsByClass("video_img").get(0).getElementsByTag("img").attr("src");
-
-        Element videoInfo = videoBox.getElementsByClass("video_info").get(0);
-        String videoInfoStr = videoInfo.html();
-        String[] sts = videoInfoStr.split("<br>");
         Map<String, String> infos = new HashMap<>();
-        for (String st : sts) {
-            Document d = Jsoup.parse(st);
-            String value = d.text();
-            if (value.indexOf(":") > -1)
-                infos.put(value.substring(0, value.indexOf(":")).trim(), value.substring(value.indexOf(":") + 1).trim());
+        Elements datas = videoInfo.getElementsByClass("data");
+        for(int i=0;i<datas.size();i++){
+            Elements data_items = datas.get(i).getElementsByClass("text-muted");
+            for(int j=0; j<data_items.size(); j++){
+                String key = data_items.get(j).ownText().trim();
+                String value = null;
+                switch (key){
+                    case "涓绘紨锛�":
+                        List<String> actors=new ArrayList<>();
+                        Elements temps =  data_items.get(j).parent().getElementsByTag("a");
+                        for(Iterator<Element> its = temps.iterator(); its.hasNext();)
+                        {
+                            actors.add( its.next().ownText());
+                        }
+                        value = StringUtil.join(actors, ",");
+                        break;
+                     default:
+                         if(data_items.get(j).nextElementSibling()!=null) {
+                             value = data_items.get(j).nextElementSibling().ownText();
+                         } else{
+                             value =  data_items.get(j).parent().ownText();
+                         }
+                }
+                infos.put(key,value);
+            }
         }
-
+        String desc = videoInfo.getElementsByClass("desc").get(0).ownText();
         //剧集列表
-        Element eposide = root.getElementsByClass("video_list_li").get(0);
+        Element eposide = doc.getElementsByClass("playlist").get(0).getElementsByClass("stui-content__playlist").get(0);
         Elements eposides = eposide.getElementsByTag("a");
         List<HanmiShowEpisode> episodeList = new ArrayList<>();
 
@@ -88,17 +114,10 @@
         //电影
         if (show.getType() != null && show.getType().contains("褰�")) {
             int index = 0;
-//            for (int i = 0; i < eposides.size(); i++) {
-//                String tag = eposides.get(i).ownText();
-//                if (tag.contains("HD")) {
-//                    index = i;
-//                    break;
-//                }
-//            }
             String href = eposides.get(index).attr("href");
             HanmiShowEpisode ep = new HanmiShowEpisode();
             ep.setOrderBy(0);
-            ep.setPlayUrl("https://www.hmtv.me" + href);
+            ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href));
             ep.setTag(show.getTitle() != null ? show.getTitle() : title);
             episodeList.add(ep);
         } else {
@@ -107,75 +126,35 @@
                 String tag = eposides.get(i).ownText();
                 HanmiShowEpisode ep = new HanmiShowEpisode();
                 ep.setOrderBy(i + 1);
-                ep.setPlayUrl("https://www.hmtv.me" + href);
+                ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href));
                 ep.setTag(tag);
                 episodeList.add(ep);
             }
         }
 
-        //简介
-        String desc = root.getElementsByClass("jianjie").get(0).text();
-
-
         if (show.getTitle() == null)
             show.setTitle(title.split(" ")[0]);
 
         show.setPicture(picture);
-        if (infos.get("涓绘紨") != null)
-            show.setActors(infos.get("涓绘紨").
-
+        if (infos.get("涓绘紨锛�") != null)
+            show.setActors(infos.get("涓绘紨锛�").
                     replace("/", ","));
-        if (infos.get("瀵兼紨") != null)
-            show.setDirector(infos.get("瀵兼紨"));
-        if (infos.get("绫诲瀷") != null)
-            show.setCategorys(infos.get("绫诲瀷"));
-        if (infos.get("鍥藉/鍦板尯") != null)
-            show.setArea(infos.get("鍥藉/鍦板尯"));
-        if (infos.get("棣栨挱") != null)
-            show.setRelaseDate(infos.get("棣栨挱").
-
-                    substring(0, infos.get("棣栨挱").
-
-                            indexOf("(") > -1 ? infos.get("棣栨挱").
-
-                            indexOf("(") : infos.get("棣栨挱").
-
-                            length()));
-        if (infos.get("涓婃槧鏃ユ湡") != null) {
-            show.setRelaseDate(infos.get("涓婃槧鏃ユ湡").
-
-                    substring(0, infos.get("涓婃槧鏃ユ湡").
-
-                            indexOf("(") > -1 ? infos.get("涓婃槧鏃ユ湡").
-
-                            indexOf("(") : infos.get("涓婃槧鏃ユ湡").
-
-                            length()));
-        }
-
-
-        if (year == null && show.getRelaseDate() != null) {
-            year = show.getRelaseDate().split("-")[0];
-        }
-
-        show.setYear(year.replace("(", "").
-                replace(")", ""));
-
+        if (infos.get("瀵兼紨锛�") != null)
+            show.setDirector(infos.get("瀵兼紨锛�"));
+        if (infos.get("绫诲瀷锛�") != null)
+            show.setCategorys(infos.get("绫诲瀷锛�"));
+        if (infos.get("鍦板尯锛�") != null)
+            show.setArea(infos.get("鍦板尯锛�"));
+        show.setDesc(desc);
+        show.setYear(infos.get("骞翠唤锛�"));
         if (show.getYear() != null && show.getRelaseDate() == null) {
             show.setRelaseDate(show.getYear() + "-01-01");
         }
-
-        show.setId(show.getUrl().
-
-                replace("https://www.hmtv.me/show/", "").
-
-                trim());
+        show.setId(show.getUrl().split("/id/")[1].split("/")[0].split("\\.")[0].trim());
         show.setEpisodeList(episodeList);
         show.setUrl(show.getUrl());
-        show.setDesc(desc.trim());
         return show;
     }
-
 
     public static List<HanmiShow> parseList(String listUrl) throws IOException {
         Map<String, String> headers = new HashMap<>();
@@ -188,28 +167,29 @@
 
         List<HanmiShow> list = new ArrayList<>();
         Document doc = getDoc(listUrl, headers);
-        String type = doc.getElementsByClass("list-content").get(0).getElementsByClass("title").get(0).getElementsByTag("strong").text();
-
-        Element root = doc.getElementsByClass("m-movies").get(0);
-        Elements items = root.getElementsByClass("u-movie");
+        Elements es =  doc.getElementsByClass("stui-pannel_bd");
+        Element root = null;
+        for(int i=0;i<es.size();i++){
+          if(  es.get(i).select("ul.stui-vodlist").size()>0){
+              root = es.get(i).select("ul.stui-vodlist").get(0);
+              break;
+          }
+        }
+        Elements items = root.getElementsByTag("li");
         for (int i = 0; i < items.size(); i++) {
             Element item = items.get(i);
             HanmiShow show = new HanmiShow();
-            show.setUrl(item.getElementsByTag("a").get(0).attr("href"));
-            show.setTag(item.getElementsByClass("zhuangtai").get(0).text());
-            String score = item.getElementsByClass("pingfen").get(0).text();
-            if (score != null) {
-                score = score.replace("鍒�", "");
-                show.setScore(score);
+            String url = item.getElementsByTag("a").get(0).attr("href");
+            if(!url.startsWith("http")){
+                URI uri = URI.create(listUrl);
+                url=String.format("%s://%s%s",uri.getScheme(),uri.getHost(),url);
             }
-            show.setTitle(item.getElementsByTag("h2").get(0).getElementsByTag("a").get(0).ownText());
-            show.setType(type);
-            show.setId(show.getUrl().
-                    replace("https://www.hmtv.me/show/", "").
-                    trim());
+            show.setUrl(url);
+            show.setTag(item.getElementsByClass("pic-text").get(0).text());
+            show.setTitle(item.getElementsByClass("stui-vodlist__detail").get(0).getElementsByTag("a").get(0).ownText());
+            show.setId(show.getUrl().split("/")[show.getUrl().split("/").length-1].split("\\.")[0].trim());
             list.add(show);
         }
-
         return list;
     }
 
@@ -221,31 +201,26 @@
      * @return
      */
     public static List<HanmiShowEpisode> getShowEpisodesFromPlayUrl(String playUrl) throws IOException {
-        List<HanmiShowEpisode> episodeList = new ArrayList<>();
-        Document doc = getDoc(playUrl, null);
-        Element els = doc.getElementById("playnav");
-        Elements items = els.getElementsByTag("li");
-        int playIndex = -1;
-        for (int i = 0; i < items.size(); i++) {
-            String name = items.get(i).text();
-            if (name.contains("HM")) {
-                playIndex = i;
-                break;
-            }
-        }
-        if (playIndex < 0)
-            return null;
-        Element tab = doc.getElementById("playcontainer").getElementsByClass("tab").get(playIndex);
-        Elements es = tab.getElementsByTag("a");
+        URI uri =  URI.create(playUrl);
 
-        for (int i = 0; i < es.size(); i++) {
-            HanmiShowEpisode episode = new HanmiShowEpisode();
-            String href = "https://www.hmtv.me" + es.get(i).attr("href");
-            String name = es.get(i).text();
-            episode.setTag(name);
-            episode.setPlayUrl(href);
-            episode.setOrderBy(i + 1);
-            episodeList.add(episode);
+        Document doc = getDoc(playUrl, getHeaders());
+
+        Element root = doc.getElementsByClass("stui-content__thumb").get(0).parent();
+
+        String picture = doc.getElementsByClass("stui-content__thumb").get(0).getElementsByTag("img").get(0).attr("data-original");
+
+        //剧集列表
+        Element eposide = doc.getElementsByClass("playlist").get(0).getElementsByClass("stui-content__playlist").get(0);
+        Elements eposides = eposide.getElementsByTag("a");
+        List<HanmiShowEpisode> episodeList = new ArrayList<>();
+        for (int i = 0; i < eposides.size(); i++) {
+                String href = eposides.get(i).attr("href");
+                String tag = eposides.get(i).ownText();
+                HanmiShowEpisode ep = new HanmiShowEpisode();
+                ep.setOrderBy(i + 1);
+                ep.setPlayUrl(String.format("%s://%s%s",uri.getScheme(),uri.getHost(),href));
+                ep.setTag(tag);
+                episodeList.add(ep);
         }
         return episodeList;
     }
@@ -255,9 +230,6 @@
         List<HanmiShow> list = new ArrayList<>();
         for (HanmiShow show : showList) {
             try {
-                if (!show.getUrl().startsWith("http")) {
-                    show.setUrl("https://www.hmtv.me" + show.getUrl());
-                }
                 list.add(parseShowDetail(show));
             } catch (Exception e) {
                 e.printStackTrace();
@@ -266,9 +238,44 @@
         return list;
     }
 
+
+    public static void parseDY() throws UnsupportedEncodingException {
+        for(int p=1;p<20;p++) {
+            String url = String.format("https://www.wztaichuan.com/vod/show/area/%s/id/1/page/%d.html", URLEncoder.encode("闊╁浗","UTF-8"), p);
+            try {
+                List<HanmiShow> showList =   parseList(url);
+                showList = parseDetailList(showList);
+                for(HanmiShow show:showList){
+                   show.setTag("璇勫垎锛�"+show.getScore());
+                    show.setCategorys("鐢靛奖");
+                }
+                System.out.println("鐢靛奖:"+p +"-"+showList.size());
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+    }
+
+
+
+
     public static void main(String[] args) throws Exception {
-        List<HanmiShowEpisode> list = getShowEpisodesFromPlayUrl("https://www.hmtv.me/vplay/MTExNS0xLTA=.html");
-        System.out.println(list);
+//        List<HanmiShowEpisode> list = getShowEpisodesFromPlayUrl("https://www.hmtv.me/vplay/MTExNS0xLTA=.html");
+//        System.out.println(list);
+//        List<HanmiShow> showList = HanmiApiUtil.parseDetailList(HanmiApiUtil.parseList("https://www.hanjutv.me/hanju/page/" + 50));
+//        System.out.printf(showList.toString());
+
+//        URI uri =  URI.create("https://www.hanjutv.me/s/1579");
+//        System.out.println(uri.getScheme());
+//          parseList("https://www.wztaichuan.com/vod/type/id/5/page/1.html");
+        parseDY();
+
+//
+//        System.out.println(uri.getHost());
+//        HanmiShow show=new HanmiShow();
+//        show.setUrl("https://www.wztaichuan.com/vod/detail/id/10195.html");
+//        show = parseShowDetail(show);
+//        System.out.println(show);
     }
 
 }

--
Gitblit v1.8.0