package com.yeshi.fanli.util.goods.jd;
|
|
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.yeshi.fanli.dao.goods.jd.NYouHuiGoods;
|
|
/**
|
* 京东内优惠商品爬取 https://www.n-youhui.com
|
*
|
* @author Administrator
|
*
|
*/
|
public class NYouHuiUtil {
|
|
private static Document getDocument(String url) {
|
Document doc = null;
|
try {
|
doc = Jsoup.connect(url).timeout(20000)
|
.userAgent(
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
|
.get();
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
return doc;
|
}
|
|
public static List<String> getClasses() {
|
List<String> classList = new ArrayList<>();
|
Document doc = getDocument("https://www.n-youhui.com");
|
Element classesRoot = doc.getElementsByClass("widget_categories").get(0);
|
Elements level0 = classesRoot.getElementsByClass("level-0");
|
for (int i = 0; i < level0.size(); i++) {
|
String name = level0.get(i).html().replace(" ", "").replaceAll("\\(([0-9]|,)*\\)", "").trim();
|
classList.add(name);
|
}
|
|
return classList;
|
}
|
|
/**
|
*
|
* @param className
|
* @return
|
*/
|
public static List<NYouHuiGoods> listByClassName(String className, int page) {
|
List<NYouHuiGoods> goodsList = new ArrayList<>();
|
String url = null;
|
try {
|
url = "https://www.n-youhui.com/" + URLEncoder.encode(className, "UTF-8").toLowerCase() + "/page/" + page;
|
} catch (UnsupportedEncodingException e1) {
|
e1.printStackTrace();
|
}
|
Document doc = getDocument(url);
|
Element content = doc.getElementsByClass("content").get(0);
|
if (content != null) {
|
Elements articles = content.getElementsByTag("article");
|
if (articles != null)
|
for (int i = 0; i < articles.size(); i++) {
|
String link = articles.get(i).getElementsByTag("a").get(0).attr("href");
|
String name = articles.get(i).getElementsByTag("a").get(0).ownText();
|
String time = articles.get(i).getElementsByClass("time").get(0).ownText();
|
time = time.split(" ")[time.split(" ").length - 1];
|
NYouHuiGoods goods = new NYouHuiGoods();
|
goods.setName(name);
|
goods.setSourceUrl(link);
|
goods.setPublishTime(time);
|
goodsList.add(goods);
|
}
|
}
|
return goodsList;
|
}
|
|
public static NYouHuiGoods getGoodsDetail(String url) {
|
NYouHuiGoods goods = new NYouHuiGoods();
|
Document doc = getDocument(url);
|
Element article = doc.getElementsByClass("article-content").get(0);
|
Elements ps = article.getElementsByTag("p");
|
String text = "";
|
for (int i = 0; i < ps.size(); i++) {
|
// 移除图片
|
Elements imgList = ps.get(i).getElementsByTag("img");
|
for (int j = 0; j < imgList.size(); j++)
|
imgList.get(j).remove();
|
text += ps.get(i).html().replace("<br>", "\n") + "\n";
|
}
|
|
text=text.trim();
|
|
String regex = "(https://u\\.jd\\.com/)[0-9A-Za-z]{1,20}";
|
Pattern pattern = Pattern.compile(regex);
|
Matcher m = pattern.matcher(text);
|
List<String> urlList = new ArrayList<>();
|
while (m.find()) {
|
urlList.add(m.group());
|
}
|
goods.setLinkList(urlList);
|
goods.setDesc(text.replaceAll(regex, "[链接]"));
|
goods.setName(doc.getElementsByClass("article-title").get(0).getElementsByTag("a").get(0).ownText());
|
String time = doc.getElementsByClass("article-meta").get(0).getElementsByTag("li").get(0).ownText().trim();
|
goods.setPublishTime(time.split(" ")[time.split(" ").length - 1]);
|
goods.setSourceUrl(url);
|
return goods;
|
}
|
|
}
|