| | |
| | | package com.yeshi.fanli.util.goods.jd;
|
| | |
|
| | | import java.io.IOException;
|
| | | import java.io.UnsupportedEncodingException;
|
| | | import java.net.URLEncoder;
|
| | | import java.util.ArrayList;
|
| | | import java.util.List;
|
| | | import java.util.regex.Matcher;
|
| | | import java.util.regex.Pattern;
|
| | |
|
| | | import org.jsoup.Jsoup;
|
| | | import org.jsoup.nodes.Document;
|
| | | import org.jsoup.nodes.Element;
|
| | | import org.jsoup.select.Elements;
|
| | |
|
| | | import com.yeshi.fanli.entity.goods.jd.NYouHuiGoods;
|
| | |
|
| | | /**
|
| | | * 京东内优惠商品爬取 https://www.n-youhui.com
|
| | |
| | | *
|
| | | */
|
| | | public class NYouHuiUtil {
|
| | | public static List<String> getClasses() {
|
| | | List<String> classList = new ArrayList<>();
|
| | |
|
| | | private static Document getDocument(String url) {
|
| | | Document doc = null;
|
| | | try {
|
| | | Document doc = Jsoup.connect("https://www.n-youhui.com").timeout(20000)
|
| | | doc = Jsoup.connect(url).timeout(20000)
|
| | | .userAgent(
|
| | | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
|
| | | .get();
|
| | | Element classesRoot = doc.getElementsByClass("widget_categories").get(0);
|
| | | Elements level0 = classesRoot.getElementsByClass("level-0");
|
| | | for (int i = 0; i < level0.size(); i++) {
|
| | | String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim();
|
| | | classList.add(name);
|
| | | }
|
| | | } catch (IOException e) {
|
| | | e.printStackTrace();
|
| | | }
|
| | | return doc;
|
| | | }
|
| | |
|
| | | public static List<String> getClasses() {
|
| | | List<String> classList = new ArrayList<>();
|
| | | Document doc = getDocument("https://www.n-youhui.com");
|
| | | Element classesRoot = doc.getElementsByClass("widget_categories").get(0);
|
| | | Elements level0 = classesRoot.getElementsByClass("level-0");
|
| | | for (int i = 0; i < level0.size(); i++) {
|
| | | String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim();
|
| | | classList.add(name.replace("/", "-"));
|
| | | }
|
| | |
|
| | | return classList;
|
| | | }
|
| | |
|
| | | /**
|
| | | * |
| | | * @param className
|
| | | * @return
|
| | | */
|
| | | public static List<NYouHuiGoods> listByClassName(String className, int page) {
|
| | | List<NYouHuiGoods> goodsList = new ArrayList<>();
|
| | | String url = null;
|
| | | try {
|
| | | url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page;
|
| | | } catch (UnsupportedEncodingException e1) {
|
| | | e1.printStackTrace();
|
| | | }
|
| | | Document doc = getDocument(url);
|
| | | Element content = doc.getElementsByClass("content").get(0);
|
| | | if (content != null) {
|
| | | Elements articles = content.getElementsByTag("article");
|
| | | if (articles != null)
|
| | | for (int i = 0; i < articles.size(); i++) {
|
| | | String link = articles.get(i).getElementsByTag("a").get(0).attr("href");
|
| | | String name = articles.get(i).getElementsByTag("a").get(0).ownText();
|
| | | String time = articles.get(i).getElementsByClass("time").get(0).ownText();
|
| | | time = time.split(" ")[time.split(" ").length - 1];
|
| | | NYouHuiGoods goods = new NYouHuiGoods();
|
| | | goods.setName(name);
|
| | | goods.setSourceUrl(link);
|
| | | goods.setPublishTime(time);
|
| | | goodsList.add(goods);
|
| | | }
|
| | | }
|
| | | return goodsList;
|
| | | }
|
| | |
|
| | | public static NYouHuiGoods getGoodsDetail(String url) {
|
| | | NYouHuiGoods goods = new NYouHuiGoods();
|
| | | Document doc = getDocument(url);
|
| | | Element article = doc.getElementsByClass("article-content").get(0);
|
| | | Elements ps = article.getElementsByTag("p");
|
| | | String text = "";
|
| | | for (int i = 0; i < ps.size(); i++) {
|
| | | // 移除图片
|
| | | Elements imgList = ps.get(i).getElementsByTag("img");
|
| | | for (int j = 0; j < imgList.size(); j++)
|
| | | imgList.get(j).remove();
|
| | | text += ps.get(i).html().replace("<br>", "\n") + "\n";
|
| | | }
|
| | | |
| | | text=text.trim();
|
| | |
|
| | | String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}";
|
| | | Pattern pattern = Pattern.compile(regex);
|
| | | Matcher m = pattern.matcher(text);
|
| | | List<String> urlList = new ArrayList<>();
|
| | | while (m.find()) {
|
| | | urlList.add(m.group());
|
| | | }
|
| | | goods.setLinkList(urlList);
|
| | | goods.setDesc(text.replaceAll(regex, "[链接]"));
|
| | | goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText());
|
| | | String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim();
|
| | | goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]);
|
| | | goods.setSourceUrl(url);
|
| | | return goods;
|
| | | }
|
| | |
|
| | | }
|