From 5e7b0ed4a154ad067cbcf4aa1a1c7cce32f9864c Mon Sep 17 00:00:00 2001 From: admin <weikou2014> Date: 星期五, 26 四月 2024 18:02:17 +0800 Subject: [PATCH] 唯品会链接解析升级 --- fanli/src/main/java/com/yeshi/fanli/util/goods/jd/NYouHuiUtil.java | 234 +++++++++++++++++++++++++++++----------------------------- 1 files changed, 117 insertions(+), 117 deletions(-) diff --git a/fanli/src/main/java/com/yeshi/fanli/util/goods/jd/NYouHuiUtil.java b/fanli/src/main/java/com/yeshi/fanli/util/goods/jd/NYouHuiUtil.java index d04d007..eb056c9 100644 --- a/fanli/src/main/java/com/yeshi/fanli/util/goods/jd/NYouHuiUtil.java +++ b/fanli/src/main/java/com/yeshi/fanli/util/goods/jd/NYouHuiUtil.java @@ -1,117 +1,117 @@ -package com.yeshi.fanli.util.goods.jd; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import com.yeshi.fanli.dao.goods.jd.NYouHuiGoods; - -/** - * 浜笢鍐呬紭鎯犲晢鍝佺埇鍙� https://www.n-youhui.com - * - * @author Administrator - * - */ -public class NYouHuiUtil { - - private static Document getDocument(String url) { - Document doc = null; - try { - doc = Jsoup.connect(url).timeout(20000) - .userAgent( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36") - .get(); - } catch (IOException e) { - e.printStackTrace(); - } - return doc; - } - - public static List<String> getClasses() { - List<String> classList = new ArrayList<>(); - Document doc = getDocument("https://www.n-youhui.com"); - Element classesRoot = doc.getElementsByClass("widget_categories").get(0); - Elements level0 = classesRoot.getElementsByClass("level-0"); - for (int i = 0; i < level0.size(); i++) { - String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim(); - classList.add(name); - } - - return classList; - } - - /** - * - * @param className - * @return - */ - public static List<NYouHuiGoods> listByClassName(String className, int page) { - List<NYouHuiGoods> goodsList = new ArrayList<>(); - String url = null; - try { - url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page; - } catch (UnsupportedEncodingException e1) { - e1.printStackTrace(); - } - Document doc = getDocument(url); - Element content = doc.getElementsByClass("content").get(0); - if (content != null) { - Elements articles = content.getElementsByTag("article"); - if (articles != null) - for (int i = 0; i < articles.size(); i++) { - String link = articles.get(i).getElementsByTag("a").get(0).attr("href"); - String name = articles.get(i).getElementsByTag("a").get(0).ownText(); - String time = articles.get(i).getElementsByClass("time").get(0).ownText(); - time = time.split(" ")[time.split(" ").length - 1]; - NYouHuiGoods goods = new NYouHuiGoods(); - goods.setName(name); - goods.setSourceUrl(link); - goods.setPublishTime(time); - goodsList.add(goods); - } - } - return goodsList; - } - - public static NYouHuiGoods getGoodsDetail(String url) { - NYouHuiGoods goods = new NYouHuiGoods(); - Document doc = getDocument(url); - Element article = doc.getElementsByClass("article-content").get(0); - Elements ps = article.getElementsByTag("p"); - String text = ""; - for (int i = 0; i < ps.size(); i++) { - // 绉婚櫎鍥剧墖 - Elements imgList = ps.get(i).getElementsByTag("img"); - for (int j = 0; j < imgList.size(); j++) - imgList.get(j).remove(); - text += ps.get(i).html().replace("<br>", "\n") + "\n"; - } - - text=text.trim(); - - String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}"; - Pattern pattern = Pattern.compile(regex); - Matcher m = pattern.matcher(text); - List<String> urlList = new ArrayList<>(); - while (m.find()) { - urlList.add(m.group()); - } - goods.setLinkList(urlList); - goods.setDesc(text.replaceAll(regex, "[閾炬帴]")); - goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText()); - String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim(); - goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]); - goods.setSourceUrl(url); - return goods; - } - -} +package com.yeshi.fanli.util.goods.jd; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.yeshi.fanli.entity.goods.jd.NYouHuiGoods; + +/** + * 浜笢鍐呬紭鎯犲晢鍝佺埇鍙� https://www.n-youhui.com + * + * @author Administrator + * + */ +public class NYouHuiUtil { + + private static Document getDocument(String url) { + Document doc = null; + try { + doc = Jsoup.connect(url).timeout(20000) + .userAgent( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36") + .get(); + } catch (IOException e) { + e.printStackTrace(); + } + return doc; + } + + public static List<String> getClasses() { + List<String> classList = new ArrayList<>(); + Document doc = getDocument("https://www.n-youhui.com"); + Element classesRoot = doc.getElementsByClass("widget_categories").get(0); + Elements level0 = classesRoot.getElementsByClass("level-0"); + for (int i = 0; i < level0.size(); i++) { + String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim(); + classList.add(name.replace("/", "-")); + } + + return classList; + } + + /** + * + * @param className + * @return + */ + public static List<NYouHuiGoods> listByClassName(String className, int page) { + List<NYouHuiGoods> goodsList = new ArrayList<>(); + String url = null; + try { + url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page; + } catch (UnsupportedEncodingException e1) { + e1.printStackTrace(); + } + Document doc = getDocument(url); + Element content = doc.getElementsByClass("content").get(0); + if (content != null) { + Elements articles = content.getElementsByTag("article"); + if (articles != null) + for (int i = 0; i < articles.size(); i++) { + String link = articles.get(i).getElementsByTag("a").get(0).attr("href"); + String name = articles.get(i).getElementsByTag("a").get(0).ownText(); + String time = articles.get(i).getElementsByClass("time").get(0).ownText(); + time = time.split(" ")[time.split(" ").length - 1]; + NYouHuiGoods goods = new NYouHuiGoods(); + goods.setName(name); + goods.setSourceUrl(link); + goods.setPublishTime(time); + goodsList.add(goods); + } + } + return goodsList; + } + + public static NYouHuiGoods getGoodsDetail(String url) { + NYouHuiGoods goods = new NYouHuiGoods(); + Document doc = getDocument(url); + Element article = doc.getElementsByClass("article-content").get(0); + Elements ps = article.getElementsByTag("p"); + String text = ""; + for (int i = 0; i < ps.size(); i++) { + // 绉婚櫎鍥剧墖 + Elements imgList = ps.get(i).getElementsByTag("img"); + for (int j = 0; j < imgList.size(); j++) + imgList.get(j).remove(); + text += ps.get(i).html().replace("<br>", "\n") + "\n"; + } + + text=text.trim(); + + String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}"; + Pattern pattern = Pattern.compile(regex); + Matcher m = pattern.matcher(text); + List<String> urlList = new ArrayList<>(); + while (m.find()) { + urlList.add(m.group()); + } + goods.setLinkList(urlList); + goods.setDesc(text.replaceAll(regex, "[閾炬帴]")); + goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText()); + String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim(); + goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]); + goods.setSourceUrl(url); + return goods; + } + +} -- Gitblit v1.8.0