admin
2021-02-06 cad915058c3c53bf328a8ae9ca9bc7de099caba7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
package com.yeshi.buwan.util.news;
 
import java.io.IOException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
 
import javax.annotation.Resource;
 
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
 
import com.yeshi.buwan.domain.news.News;
import com.yeshi.buwan.domain.news.NewsImage;
import com.yeshi.buwan.domain.news.NewsType;
import com.yeshi.buwan.domain.news.NewsTypeContent;
import com.yeshi.buwan.service.imp.news.SouGouService;
import com.yeshi.buwan.util.StringUtil;
 
@Component
public class SouGouParser {
    @Resource
    private SouGouService souGouService;
 
    public SouGouService getSouGouService() {
        return souGouService;
    }
 
    public void setSouGouService(SouGouService souGouService) {
        this.souGouService = souGouService;
    }
 
    public static String REMEN = "http://weixin.sogou.com/wapindex/wap/0612/wap_0/#.html";// 热门
    public static String TUIJIAN = "http://weixin.sogou.com/wapindex/wap/0612/wap_1/#.html";// 推荐
    public static String DUANZISHOU = "http://weixin.sogou.com/wapindex/wap/0612/wap_2/#.html";// 段子手
    public static String YANGSHENGTANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_3/#.html";// 养生堂
    public static String SIFANGHUA = "http://weixin.sogou.com/wapindex/wap/0612/wap_4/#.html";// 私房话
    public static String BAGUAJING = "http://weixin.sogou.com/wapindex/wap/0612/wap_5/#.html";// 八卦精
    public static String BAISHITONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_6/#.html";// 百事通
    public static String CAIJINGMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_7/#.html";// 财经迷
    public static String QICHEMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_8/#.html";// 汽车迷
    public static String KEJIKA = "http://weixin.sogou.com/wapindex/wap/0612/wap_9/#.html";// 科技卡
    public static String WANRENMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_10/#.html";// 万人迷
    public static String BAOBAOKONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_11/#.html";// 宝宝控
    public static String DIANZHANDANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_12/#.html";// 点赞党
    public static String LVXINGJIA = "http://weixin.sogou.com/wapindex/wap/0612/wap_13/#.html";// 旅行家
    public static String BAIGUJING = "http://weixin.sogou.com/wapindex/wap/0612/wap_14/#.html";// 白骨精
    public static String MEISHIJIA = "http://weixin.sogou.com/wapindex/wap/0612/wap_15/#.html";// 美食家
    public static String GUJINTONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_16/#.html";// 古今同
    public static String KAOZHENGDANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_17/#.html";// 考证党
    public static String XINGZUOKONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_18/#.html";// 星座控
    public static String TIYUMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_19/#.html";// 体育迷
 
    public static String[] URLS = { REMEN, TUIJIAN, DUANZISHOU, YANGSHENGTANG, SIFANGHUA, BAGUAJING, BAISHITONG,
            CAIJINGMI, QICHEMI, KEJIKA, WANRENMI, BAOBAOKONG, DIANZHANDANG, LVXINGJIA, BAIGUJING, MEISHIJIA, GUJINTONG,
            KAOZHENGDANG, XINGZUOKONG, TIYUMI };
 
    public static String getUrl(int page, String url) {
        return url.replace("#.html", (page - 1) + ".html");
    }
 
    private static String getType(String url) {
        for (int i = 0; i < URLS.length; i++) {
            if (url.equalsIgnoreCase(URLS[i]))
                return (i + 2) + "";
        }
        return null;
    }
 
    public void deleteUnavailable() {
        souGouService.deleteBeforeTime(System.currentTimeMillis() - 1000 * 60 * 60 * 2);
    }
 
    public void startParse() {
        for (String mu : URLS)
            for (int i = 1; i < 5; i++) {
                NewsType type = new NewsType(getType(mu));
                List<News> list = parseWeiXinList(getUrl(i, mu));
                for (News wx : list) {
                    wx.setId(souGouService.addWeiXinArticle(wx, wx.getImgList()));
                    if (!StringUtil.isNullOrEmpty(wx.getId())) {
                        NewsTypeContent aw = new NewsTypeContent();
                        aw.setContent(wx);
                        aw.setType(type);
                        souGouService.addWeiXinArticleTypeContent(aw);
                    }
                }
            }
        // 将热门前3导入进去到封面
        deleteUnavailable();
        souGouService.updateFoundNews();
    }
 
    public List<News> parseWeiXinList(String url) {
        try {
            Document doc = Jsoup.connect(url).userAgent("Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)")
                    .timeout(20 * 1000).get();
            Elements els = doc.getElementsByTag("li");
            List<News> list = new ArrayList<News>();
            for (int i = 0; i < els.size(); i++) {
                Element el = els.get(i);
                String key = el.attr("d");
                String href = el.getElementsByTag("a").get(0).attr("href");
                String src = el.getElementsByTag("img").get(0).attr("src");
 
                News wx = new News();
                wx.setMarkid(key);
 
                if (el.getElementsByAttributeValue("class", "i1") == null
                        || el.getElementsByAttributeValue("class", "i1").size() == 0) {
                    wx.setContentType(1);
                } else
                    wx.setContentType(3);
                wx.setUrl(href);
                List<NewsImage> imgList = new ArrayList<NewsImage>();
                imgList.add(new NewsImage(null, URLDecoder.decode(src, "UTF-8"), System.currentTimeMillis() + "", 1));
                wx.setImgList(imgList);
                wx = parseArticle(wx);
                wx.setMarkid(key);
                wx.setShowType(3);
                wx.setShow(true);
                list.add(wx);
            }
            return list;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new ArrayList<News>();
    }
 
    public News parseArticle(News wx) {
        News nwx = parseArticle(wx.getUrl());
        nwx.setImgList(wx.getImgList());
        return nwx;
    }
 
    // 文章解析
    public News parseArticle(String url) {
        News wx = new News();
 
        Document doc = null;
        try {
            doc = Jsoup.connect(url).userAgent("Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)")
                    .timeout(20000).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        Element root = doc.getElementById("img-content");// js_content
        String title = root.getElementById("activity-name").text().trim();
        String date = root.getElementById("post-date").text().trim();//
        String user = root.getElementById("post-user").text().trim();//
        wx.setCreatetime(System.currentTimeMillis() + "");
        wx.setDate(date);
        wx.setTitle(title);
        wx.setUrl(url);
        wx.setFrom(user);
        return wx;
    }
 
    public static String get(String url) {
        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(url);
        try {
            method.setRequestHeader("User-Agent", "Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)");
            client.executeMethod(method);
            return method.getResponseBodyAsString();
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }
}