package com.yeshi.buwan.util.news;
|
|
import java.io.IOException;
|
import java.net.URLDecoder;
|
import java.util.ArrayList;
|
import java.util.List;
|
|
import javax.annotation.Resource;
|
|
import org.apache.commons.httpclient.HttpClient;
|
import org.apache.commons.httpclient.HttpException;
|
import org.apache.commons.httpclient.methods.GetMethod;
|
import org.jsoup.Jsoup;
|
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Element;
|
import org.jsoup.select.Elements;
|
import org.springframework.stereotype.Component;
|
|
import com.yeshi.buwan.domain.news.News;
|
import com.yeshi.buwan.domain.news.NewsImage;
|
import com.yeshi.buwan.domain.news.NewsType;
|
import com.yeshi.buwan.domain.news.NewsTypeContent;
|
import com.yeshi.buwan.service.imp.news.SouGouService;
|
import com.yeshi.buwan.util.StringUtil;
|
|
@Component
|
public class SouGouParser {
|
@Resource
|
private SouGouService souGouService;
|
|
public static String REMEN = "http://weixin.sogou.com/wapindex/wap/0612/wap_0/#.html";// 热门
|
public static String TUIJIAN = "http://weixin.sogou.com/wapindex/wap/0612/wap_1/#.html";// 推荐
|
public static String DUANZISHOU = "http://weixin.sogou.com/wapindex/wap/0612/wap_2/#.html";// 段子手
|
public static String YANGSHENGTANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_3/#.html";// 养生堂
|
public static String SIFANGHUA = "http://weixin.sogou.com/wapindex/wap/0612/wap_4/#.html";// 私房话
|
public static String BAGUAJING = "http://weixin.sogou.com/wapindex/wap/0612/wap_5/#.html";// 八卦精
|
public static String BAISHITONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_6/#.html";// 百事通
|
public static String CAIJINGMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_7/#.html";// 财经迷
|
public static String QICHEMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_8/#.html";// 汽车迷
|
public static String KEJIKA = "http://weixin.sogou.com/wapindex/wap/0612/wap_9/#.html";// 科技卡
|
public static String WANRENMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_10/#.html";// 万人迷
|
public static String BAOBAOKONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_11/#.html";// 宝宝控
|
public static String DIANZHANDANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_12/#.html";// 点赞党
|
public static String LVXINGJIA = "http://weixin.sogou.com/wapindex/wap/0612/wap_13/#.html";// 旅行家
|
public static String BAIGUJING = "http://weixin.sogou.com/wapindex/wap/0612/wap_14/#.html";// 白骨精
|
public static String MEISHIJIA = "http://weixin.sogou.com/wapindex/wap/0612/wap_15/#.html";// 美食家
|
public static String GUJINTONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_16/#.html";// 古今同
|
public static String KAOZHENGDANG = "http://weixin.sogou.com/wapindex/wap/0612/wap_17/#.html";// 考证党
|
public static String XINGZUOKONG = "http://weixin.sogou.com/wapindex/wap/0612/wap_18/#.html";// 星座控
|
public static String TIYUMI = "http://weixin.sogou.com/wapindex/wap/0612/wap_19/#.html";// 体育迷
|
|
public static String[] URLS = { REMEN, TUIJIAN, DUANZISHOU, YANGSHENGTANG, SIFANGHUA, BAGUAJING, BAISHITONG,
|
CAIJINGMI, QICHEMI, KEJIKA, WANRENMI, BAOBAOKONG, DIANZHANDANG, LVXINGJIA, BAIGUJING, MEISHIJIA, GUJINTONG,
|
KAOZHENGDANG, XINGZUOKONG, TIYUMI };
|
|
public static String getUrl(int page, String url) {
|
return url.replace("#.html", (page - 1) + ".html");
|
}
|
|
private static String getType(String url) {
|
for (int i = 0; i < URLS.length; i++) {
|
if (url.equalsIgnoreCase(URLS[i]))
|
return (i + 2) + "";
|
}
|
return null;
|
}
|
|
public void deleteUnavailable() {
|
souGouService.deleteBeforeTime(System.currentTimeMillis() - 1000 * 60 * 60 * 2);
|
}
|
|
public void startParse() {
|
for (String mu : URLS)
|
for (int i = 1; i < 5; i++) {
|
NewsType type = new NewsType(getType(mu));
|
List<News> list = parseWeiXinList(getUrl(i, mu));
|
for (News wx : list) {
|
wx.setId(souGouService.addWeiXinArticle(wx, wx.getImgList()));
|
if (!StringUtil.isNullOrEmpty(wx.getId())) {
|
NewsTypeContent aw = new NewsTypeContent();
|
aw.setContent(wx);
|
aw.setType(type);
|
souGouService.addWeiXinArticleTypeContent(aw);
|
}
|
}
|
}
|
// 将热门前3导入进去到封面
|
deleteUnavailable();
|
souGouService.updateFoundNews();
|
}
|
|
public List<News> parseWeiXinList(String url) {
|
try {
|
Document doc = Jsoup.connect(url).userAgent("Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)")
|
.timeout(20 * 1000).get();
|
Elements els = doc.getElementsByTag("li");
|
List<News> list = new ArrayList<News>();
|
for (int i = 0; i < els.size(); i++) {
|
Element el = els.get(i);
|
String key = el.attr("d");
|
String href = el.getElementsByTag("a").get(0).attr("href");
|
String src = el.getElementsByTag("img").get(0).attr("src");
|
|
News wx = new News();
|
wx.setMarkid(key);
|
|
if (el.getElementsByAttributeValue("class", "i1") == null
|
|| el.getElementsByAttributeValue("class", "i1").size() == 0) {
|
wx.setContentType(1);
|
} else
|
wx.setContentType(3);
|
wx.setUrl(href);
|
List<NewsImage> imgList = new ArrayList<NewsImage>();
|
imgList.add(new NewsImage(null, URLDecoder.decode(src, "UTF-8"), System.currentTimeMillis() + "", 1));
|
wx.setImgList(imgList);
|
wx = parseArticle(wx);
|
wx.setMarkid(key);
|
wx.setShowType(3);
|
wx.setShow(true);
|
list.add(wx);
|
}
|
return list;
|
} catch (Exception e) {
|
e.printStackTrace();
|
}
|
return new ArrayList<News>();
|
}
|
|
public News parseArticle(News wx) {
|
News nwx = parseArticle(wx.getUrl());
|
nwx.setImgList(wx.getImgList());
|
return nwx;
|
}
|
|
// 文章解析
|
public News parseArticle(String url) {
|
News wx = new News();
|
|
Document doc = null;
|
try {
|
doc = Jsoup.connect(url).userAgent("Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)")
|
.timeout(20000).get();
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
Element root = doc.getElementById("img-content");// js_content
|
String title = root.getElementById("activity-name").text().trim();
|
String date = root.getElementById("post-date").text().trim();//
|
String user = root.getElementById("post-user").text().trim();//
|
wx.setCreatetime(System.currentTimeMillis() + "");
|
wx.setDate(date);
|
wx.setTitle(title);
|
wx.setUrl(url);
|
wx.setFrom(user);
|
return wx;
|
}
|
|
public static String get(String url) {
|
HttpClient client = new HttpClient();
|
GetMethod method = new GetMethod(url);
|
try {
|
method.setRequestHeader("User-Agent", "Dalvik/2.1.0 (Linux; U; Android 5.0.2; MI 2S MIUI/5.12.10)");
|
client.executeMethod(method);
|
return method.getResponseBodyAsString();
|
} catch (HttpException e) {
|
e.printStackTrace();
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
return "";
|
}
|
}
|