| | |
| | | package com.yeshi.fanli.util.goods.jd;
|
| | |
|
| | | import java.io.IOException;
|
| | | import java.util.ArrayList;
|
| | | import java.util.List;
|
| | |
|
| | | import org.jsoup.Jsoup;
|
| | | import org.jsoup.nodes.Document;
|
| | | import org.jsoup.nodes.Element;
|
| | | import org.jsoup.select.Elements;
|
| | |
|
| | | /**
|
| | | * 京东内优惠商品爬取 https://www.n-youhui.com
|
| | | * |
| | | * @author Administrator
|
| | | *
|
| | | */
|
| | | public class NYouHuiUtil {
|
| | | public static List<String> getClasses() {
|
| | | List<String> classList = new ArrayList<>();
|
| | | try {
|
| | | Document doc = Jsoup.connect("https://www.n-youhui.com").timeout(20000)
|
| | | .userAgent(
|
| | | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
|
| | | .get();
|
| | | Element classesRoot = doc.getElementsByClass("widget_categories").get(0);
|
| | | Elements level0 = classesRoot.getElementsByClass("level-0");
|
| | | for (int i = 0; i < level0.size(); i++) {
|
| | | String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim();
|
| | | classList.add(name);
|
| | | }
|
| | | } catch (IOException e) {
|
| | | e.printStackTrace();
|
| | | }
|
| | | return classList;
|
| | | }
|
| | |
|
| | | }
|
| | | package com.yeshi.fanli.util.goods.jd; |
| | | |
| | | import java.io.IOException; |
| | | import java.io.UnsupportedEncodingException; |
| | | import java.net.URLEncoder; |
| | | import java.util.ArrayList; |
| | | import java.util.List; |
| | | import java.util.regex.Matcher; |
| | | import java.util.regex.Pattern; |
| | | |
| | | import org.jsoup.Jsoup; |
| | | import org.jsoup.nodes.Document; |
| | | import org.jsoup.nodes.Element; |
| | | import org.jsoup.select.Elements; |
| | | |
| | | import com.yeshi.fanli.entity.goods.jd.NYouHuiGoods; |
| | | |
| | | /** |
| | | * 京东内优惠商品爬取 https://www.n-youhui.com |
| | | * |
| | | * @author Administrator |
| | | * |
| | | */ |
| | | public class NYouHuiUtil { |
| | | |
| | | private static Document getDocument(String url) { |
| | | Document doc = null; |
| | | try { |
| | | doc = Jsoup.connect(url).timeout(20000) |
| | | .userAgent( |
| | | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36") |
| | | .get(); |
| | | } catch (IOException e) { |
| | | e.printStackTrace(); |
| | | } |
| | | return doc; |
| | | } |
| | | |
| | | public static List<String> getClasses() { |
| | | List<String> classList = new ArrayList<>(); |
| | | Document doc = getDocument("https://www.n-youhui.com"); |
| | | Element classesRoot = doc.getElementsByClass("widget_categories").get(0); |
| | | Elements level0 = classesRoot.getElementsByClass("level-0"); |
| | | for (int i = 0; i < level0.size(); i++) { |
| | | String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim(); |
| | | classList.add(name.replace("/", "-")); |
| | | } |
| | | |
| | | return classList; |
| | | } |
| | | |
| | | /** |
| | | * |
| | | * @param className |
| | | * @return |
| | | */ |
| | | public static List<NYouHuiGoods> listByClassName(String className, int page) { |
| | | List<NYouHuiGoods> goodsList = new ArrayList<>(); |
| | | String url = null; |
| | | try { |
| | | url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page; |
| | | } catch (UnsupportedEncodingException e1) { |
| | | e1.printStackTrace(); |
| | | } |
| | | Document doc = getDocument(url); |
| | | Element content = doc.getElementsByClass("content").get(0); |
| | | if (content != null) { |
| | | Elements articles = content.getElementsByTag("article"); |
| | | if (articles != null) |
| | | for (int i = 0; i < articles.size(); i++) { |
| | | String link = articles.get(i).getElementsByTag("a").get(0).attr("href"); |
| | | String name = articles.get(i).getElementsByTag("a").get(0).ownText(); |
| | | String time = articles.get(i).getElementsByClass("time").get(0).ownText(); |
| | | time = time.split(" ")[time.split(" ").length - 1]; |
| | | NYouHuiGoods goods = new NYouHuiGoods(); |
| | | goods.setName(name); |
| | | goods.setSourceUrl(link); |
| | | goods.setPublishTime(time); |
| | | goodsList.add(goods); |
| | | } |
| | | } |
| | | return goodsList; |
| | | } |
| | | |
| | | public static NYouHuiGoods getGoodsDetail(String url) { |
| | | NYouHuiGoods goods = new NYouHuiGoods(); |
| | | Document doc = getDocument(url); |
| | | Element article = doc.getElementsByClass("article-content").get(0); |
| | | Elements ps = article.getElementsByTag("p"); |
| | | String text = ""; |
| | | for (int i = 0; i < ps.size(); i++) { |
| | | // 移除图片 |
| | | Elements imgList = ps.get(i).getElementsByTag("img"); |
| | | for (int j = 0; j < imgList.size(); j++) |
| | | imgList.get(j).remove(); |
| | | text += ps.get(i).html().replace("<br>", "\n") + "\n"; |
| | | } |
| | | |
| | | text=text.trim(); |
| | | |
| | | String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}"; |
| | | Pattern pattern = Pattern.compile(regex); |
| | | Matcher m = pattern.matcher(text); |
| | | List<String> urlList = new ArrayList<>(); |
| | | while (m.find()) { |
| | | urlList.add(m.group()); |
| | | } |
| | | goods.setLinkList(urlList); |
| | | goods.setDesc(text.replaceAll(regex, "[链接]")); |
| | | goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText()); |
| | | String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim(); |
| | | goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]); |
| | | goods.setSourceUrl(url); |
| | | return goods; |
| | | } |
| | | |
| | | } |