admin
2019-12-24 8dc8133fb93405c6fc34c9c3c6c6bbce09ebe7f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package com.yeshi.fanli.util.goods.jd;
 
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.yeshi.fanli.dao.goods.jd.NYouHuiGoods;
 
/**
 * 京东内优惠商品爬取 https://www.n-youhui.com
 * 
 * @author Administrator
 *
 */
public class NYouHuiUtil {
 
    private static Document getDocument(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).timeout(20000)
                    .userAgent(
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
 
    public static List<String> getClasses() {
        List<String> classList = new ArrayList<>();
        Document doc = getDocument("https://www.n-youhui.com");
        Element classesRoot = doc.getElementsByClass("widget_categories").get(0);
        Elements level0 = classesRoot.getElementsByClass("level-0");
        for (int i = 0; i < level0.size(); i++) {
            String name = level0.get(i).html().replace("&nbsp;", "").replaceAll("\\(([0-9]|,)*\\)", "").trim();
            classList.add(name);
        }
 
        return classList;
    }
 
    /**
     * 
     * @param className
     * @return
     */
    public static List<NYouHuiGoods> listByClassName(String className, int page) {
        List<NYouHuiGoods> goodsList = new ArrayList<>();
        String url = null;
        try {
            url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page;
        } catch (UnsupportedEncodingException e1) {
            e1.printStackTrace();
        }
        Document doc = getDocument(url);
        Element content = doc.getElementsByClass("content").get(0);
        if (content != null) {
            Elements articles = content.getElementsByTag("article");
            if (articles != null)
                for (int i = 0; i < articles.size(); i++) {
                    String link = articles.get(i).getElementsByTag("a").get(0).attr("href");
                    String name = articles.get(i).getElementsByTag("a").get(0).ownText();
                    String time = articles.get(i).getElementsByClass("time").get(0).ownText();
                    time = time.split(" ")[time.split(" ").length - 1];
                    NYouHuiGoods goods = new NYouHuiGoods();
                    goods.setName(name);
                    goods.setSourceUrl(link);
                    goods.setPublishTime(time);
                    goodsList.add(goods);
                }
        }
        return goodsList;
    }
 
    public static NYouHuiGoods getGoodsDetail(String url) {
        NYouHuiGoods goods = new NYouHuiGoods();
        Document doc = getDocument(url);
        Element article = doc.getElementsByClass("article-content").get(0);
        Elements ps = article.getElementsByTag("p");
        String text = "";
        for (int i = 0; i < ps.size(); i++) {
            // 移除图片
            Elements imgList = ps.get(i).getElementsByTag("img");
            for (int j = 0; j < imgList.size(); j++)
                imgList.get(j).remove();
            text += ps.get(i).html().replace("<br>", "\n") + "\n";
        }
        
        text=text.trim();
 
        String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}";
        Pattern pattern = Pattern.compile(regex);
        Matcher m = pattern.matcher(text);
        List<String> urlList = new ArrayList<>();
        while (m.find()) {
            urlList.add(m.group());
        }
        goods.setLinkList(urlList);
        goods.setDesc(text.replaceAll(regex, "[链接]"));
        goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText());
        String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim();
        goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]);
        goods.setSourceUrl(url);
        return goods;
    }
 
}