"""
|
同花顺网站爬虫
|
"""
|
|
import json
|
import os
|
import time
|
from urllib import parse
|
import pymongo
|
import requests
|
from selenium import webdriver
|
import scrapy
|
from scrapy import cmdline
|
from selenium.webdriver import ActionChains
|
from selenium.webdriver.common.by import By
|
import mysql_data
|
|
|
def save(dn_name, datas):
    """Upsert documents into the ``gp.<dn_name>`` MongoDB collection.

    Each item in *datas* must carry an ``_id``; an existing document with
    the same ``_id`` is replaced, otherwise the document is inserted.

    :param dn_name: target collection name inside the ``gp`` database
    :param datas: iterable of dicts, each containing an ``_id`` key
    """
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        collections = client.gp[dn_name]
        for data in datas:
            # replace_one with upsert=True is a single atomic operation,
            # unlike the former delete_one + insert_one pair which could
            # lose the document if interrupted between the two calls.
            collections.replace_one({"_id": data["_id"]}, data, upsert=True)
    finally:
        # The original leaked one client (and its connection pool) per call.
        client.close()
|
def remove_all(dn_name):
    """Delete every document from the ``gp.<dn_name>`` collection.

    :param dn_name: collection name inside the ``gp`` database
    """
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        # delete_many requires a dict filter; the empty dict matches all
        # documents.  The previous ``filter=""`` raised a TypeError in
        # pymongo, so this function never actually deleted anything.
        client.gp[dn_name].delete_many({})
    finally:
        client.close()
|
def industry():
    """Fetch the industry-list result pages from iwencai and cache them.

    Downloads pages 1-3 of the stock-pick search result and writes each
    raw response body to ``...\\ocr\\行业\\{page}.txt``.  Pages already
    cached on disk are skipped — the original opened files in ``'x'``
    mode, which crashed with FileExistsError on any re-run, and never
    closed the handle.
    """
    cookie = "ta_random_userid=uppxpgjh71; WafStatus=0; cid=c9859f3a7fef885083cabd7fec6aef471654074712; ComputerID=c9859f3a7fef885083cabd7fec6aef471654074712; PHPSESSID=0d9309b0017225861bbe1446c738cc28; user=MDptb181NTkzODg0ODA6Ok5vbmU6NTAwOjU2OTM4ODQ4MDo1LDEsNDA7NiwxLDQwOzcsMTExMTExMTExMTEwLDQwOzgsMTExMTAxMTEwMDAwMTExMTEwMDEwMDEwMDEwMDAwMDAsNDA7MzMsMDAwMTAwMDAwMDAwLDI3OTszNiwxMDAxMTExMTAwMDAxMTAwMTAxMTExMTEsMjc5OzQ2LDAwMDAxMTExMTAwMDAwMTExMTExMTExMSwyNzk7NTEsMTEwMDAwMDAwMDAwMDAwMCwyNzk7NTgsMDAwMDAwMDAwMDAwMDAwMDEsMjc5Ozc4LDEsMjc5Ozg3LDAwMDAwMDAwMDAwMDAwMDAwMDAxMDAwMCwyNzk7NDQsMTEsNDA7MSwxMDEsNDA7MiwxLDQwOzMsMSw0MDsxMDIsMSw0MDoyNzo6OjU1OTM4ODQ4MDoxNjU3MTgzODQ3Ojo6MTYwOTE2NDE4MDo0MDAxNTM6MDoxOGM1M2ViM2MzMTUyYzFlZmMxMTY3MjcyZTExOTczY2U6ZGVmYXVsdF80OjE=; userid=559388480; u_name=mo_559388480; escapename=mo_559388480; ticket=08b7e8531104dda481d989ace9041a41; user_status=0; utk=c1489e81726ce1d382920df78aa3726e; v=A9d8Z87veejuOP1khsuI644BZkAkHKBzheEvrykE8_QOZfm-Mew7zpXAv1E6"
    refer = "http://www.iwencai.com/stockpick/search?typed=0&preParams=&ts=1&f=1&qs=result_original&selfsectsn=&querytype=stock&searchfilter=&tid=stockpick&w=%E8%A1%8C%E4%B8%9A"
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    headers = {
        "User-Agent": user_agent,
        "Referer": refer,
        "Cookie": cookie
    }

    url = "http://www.iwencai.com/stockpick/cache?token=aa18c9c043a235f62ca1d6e54ce8ebf9&p={}&perpage=30&showType=[%22%22,%22%22,%22onTable%22,%22onTable%22,%22onTable%22,%22onTable%22]"
    for i in range(1, 4):
        path = "C:\\Users\\Administrator\\Desktop\\ocr\\行业\\{}.txt".format(i)
        if os.path.exists(path):
            # Already cached — skipping keeps this consistent with
            # industry_code() and avoids the FileExistsError of 'x' mode.
            continue
        link = url.format(i)
        resp = requests.get(link, headers=headers)
        # 'x' mode still guards against a concurrent writer creating the
        # file between the exists() check and the open().
        with open(path, 'x') as f:
            f.write(resp.text)
        print(link)
|
# 问财页面:http://www.iwencai.com/unifiedwap/result?w=%E7%94%B5%E5%8A%9B&querytype=stock
|
def industry_code(name):
    """Query the wencai robot-data API for one industry and cache the reply.

    Skips the request entirely when ``...\\ocr\\行业\\{name}.txt`` already
    exists; otherwise sleeps 10 s (crude rate limiting), POSTs the query
    and writes the raw response text to that file.

    :param name: industry name, used both as the query and the cache key
    :return: ``None`` in all cases (the result lives on disk)
    """
    path = "C:\\Users\\Administrator\\Desktop\\ocr\\行业\\{}.txt".format(name)
    if os.path.exists(path):
        return None
    # Throttle so repeated calls do not hammer the endpoint.
    time.sleep(10)
    print(name)
    cookie = "ta_random_userid=uppxpgjh71; WafStatus=0; cid=c9859f3a7fef885083cabd7fec6aef471654074712; ComputerID=c9859f3a7fef885083cabd7fec6aef471654074712; PHPSESSID=0d9309b0017225861bbe1446c738cc28; user=MDptb181NTkzODg0ODA6Ok5vbmU6NTAwOjU2OTM4ODQ4MDo1LDEsNDA7NiwxLDQwOzcsMTExMTExMTExMTEwLDQwOzgsMTExMTAxMTEwMDAwMTExMTEwMDEwMDEwMDEwMDAwMDAsNDA7MzMsMDAwMTAwMDAwMDAwLDI3OTszNiwxMDAxMTExMTAwMDAxMTAwMTAxMTExMTEsMjc5OzQ2LDAwMDAxMTExMTAwMDAwMTExMTExMTExMSwyNzk7NTEsMTEwMDAwMDAwMDAwMDAwMCwyNzk7NTgsMDAwMDAwMDAwMDAwMDAwMDEsMjc5Ozc4LDEsMjc5Ozg3LDAwMDAwMDAwMDAwMDAwMDAwMDAxMDAwMCwyNzk7NDQsMTEsNDA7MSwxMDEsNDA7MiwxLDQwOzMsMSw0MDsxMDIsMSw0MDoyNzo6OjU1OTM4ODQ4MDoxNjU3MTgzODQ3Ojo6MTYwOTE2NDE4MDo0MDAxNTM6MDoxOGM1M2ViM2MzMTUyYzFlZmMxMTY3MjcyZTExOTczY2U6ZGVmYXVsdF80OjE%3D; userid=559388480; u_name=mo_559388480; escapename=mo_559388480; ticket=08b7e8531104dda481d989ace9041a41; user_status=0; utk=c1489e81726ce1d382920df78aa3726e; v=A07liJ-coH6fyxRaeGmBfE_Cny8VzxF7JJbGpXiFu10MxeAR4F9i2fQjFpxL"
    refer = "http://www.iwencai.com/unifiedwap/result?w={}&querytype=stock".format(parse.quote(name))
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    headers = {
        "User-Agent": user_agent,
        "Referer": refer,
        "Cookie": cookie,
        "Content-Type": 'application/json;charset=UTF-8',
        # anti-bot token; must match the ``v=`` cookie value above
        "hexin-v": "A07liJ-coH6fyxRaeGmBfE_Cny8VzxF7JJbGpXiFu10MxeAR4F9i2fQjFpxL",
        "Origin": "http://www.iwencai.com"
    }
    link = "http://www.iwencai.com/customized/chart/get-robot-data"
    data = {
        "question": name,
        "perpage": "400",
        "page": 1,
        "secondary_intent": "stock",
        "log_info": "{\"input_type\":\"click\"}",
        "source": "Ths_iwencai_Xuangu",
        "version": "2.0",
        "query_area": "",
        "block_list": "",
        "add_info": "{\"urp\":{\"scene\":1,\"company\":1,\"business\":1},\"contentType\":\"json\",\"searchInfo\":true}"
    }
    resp = requests.post(link, json=data, headers=headers)
    # ``with`` guarantees the cache file is flushed and closed; the
    # original left the handle dangling.
    with open(path, 'w') as f:
        f.write(resp.text)
    print(resp.text)
|
|
def parse_industry():
    """Read the cached industry-list pages and return the first 76 names.

    Parses ``行业\\1.txt`` .. ``行业\\3.txt`` (written by ``industry()``);
    each file holds one JSON line whose ``result`` rows carry the
    industry name at index 1.

    :return: list of up to 76 industry names, in page order
    """
    totals = []
    for page in range(1, 4):
        path = "C:\\Users\\Administrator\\Desktop\\ocr\\行业\\{}.txt".format(page)
        # ``with`` closes each file; the original leaked three handles.
        with open(path, 'r') as f:
            payload = json.loads(f.readline())
        for row in payload["result"]:
            # Original capped the result at the first 76 rows overall
            # (a count-based check equivalent to this length check).
            if len(totals) < 76:
                totals.append(row[1])
    # NOTE: a dead ``sorted(...)`` call whose result was never used has
    # been removed; the returned order is unchanged (page order).
    return totals
|
|
def parse_code(name):
    """Parse a cached wencai robot-data response for one industry.

    Reads ``行业\\{name}.txt`` (written by ``industry_code``) and extracts
    the per-stock industry hierarchy.

    :param name: industry name / cache-file stem
    :return: list of ``{'_id', 'first_industry', 'second_industry',
        'three_industry'}`` dicts, or ``None`` when the cached response
        reported a non-zero ``status_code``
    """
    path = "C:\\Users\\Administrator\\Desktop\\ocr\\行业\\{}.txt".format(name)
    with open(path, 'r') as f:
        payload = json.loads(f.readline())
    if payload["status_code"] != 0:
        return None

    # Deeply nested path into the robot-data response structure.
    results = payload["data"]["answer"][0]["txt"][0]["content"]["components"][0]["data"]["datas"]
    data_list = []
    for row in results:
        # Split the "一级-二级-三级" industry string once instead of three times.
        parts = row["所属同花顺行业"].split('-')
        data_list.append({
            '_id': row["code"],
            'first_industry': parts[0],
            'second_industry': parts[1],
            'three_industry': parts[2],
        })
    return data_list
|
|
def check_charset(file_path):
    """Detect the text encoding of *file_path* with chardet.

    Reads a 4 KiB sample; the previous 4-byte sample was only enough to
    recognise a BOM and made chardet guess blindly for everything else.

    :param file_path: path of the file to probe
    :return: encoding name string, or ``None`` if chardet cannot tell
    """
    import chardet  # local import: keeps chardet optional for the rest of the module

    with open(file_path, "rb") as f:
        sample = f.read(4096)
    return chardet.detect(sample)['encoding']
|
|
def parse_extra():
    """Parse the manually collected ``extra.txt`` industry dump.

    Each line of the file is a JSON object whose ``result`` rows hold
    ``code.suffix`` at index 0 and the "一级-二级-三级" industry string at
    index 4.  The file encoding is auto-detected via ``check_charset``.

    :return: list of ``{'_id', 'first_industry', 'second_industry',
        'three_industry'}`` dicts
    """
    path = "C:\\Users\\Administrator\\Desktop\\ocr\\行业\\extra.txt"
    charset = check_charset(path)
    print(charset)
    data_list = []
    # ``with`` closes the file; the original leaked the handle.
    with open(path, 'r', encoding=charset) as f:
        for line in f:
            payload = json.loads(line)
            for row in payload["result"]:
                parts = row[4].split('-')
                d = {
                    # strip the exchange suffix: "600000.SH" -> "600000"
                    '_id': row[0].split(".")[0],
                    'first_industry': parts[0],
                    'second_industry': parts[1],
                    'three_industry': parts[2],
                }
                data_list.append(d)
                print(d)
    return data_list
|
|
# industrys = parse_industry()
|
# for i in industrys:
|
# list = parse_code(i)
|
# save("ths-industry-codes", list)
|
# print(i, len(list))
|
|
|
# print(i)
|
# try:
|
# save("ths-industry-codes", parse_code(i))
|
# except Exception as e:
|
# print("出错:", i)
|
|
# industry_code("计算机应用")
|
|
# list = parse_code("计算机应用")
|
# print(len(list))
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
def get_industry_sel(industry_names):
    """Scrape each stock's Tonghuashun industry via Selenium.

    For every query in *industry_names*, loads the wencai result table,
    walks all pager pages, reads the stock code and the
    '所属同花顺行业' column of every row, and upserts the rows into the
    ``ths-industry-codes`` MongoDB collection via ``save``.

    :param industry_names: iterable of industry query strings
    """
    # Start Chrome with automation fingerprinting disabled so iwencai
    # does not detect the webdriver session.
    options = Options()
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)
    # Must visit the domain once before add_cookie() is allowed.
    driver.get(
        "https://www.iwencai.com/unifiedwap/result?w=%E8%AE%A1%E7%AE%97%E6%9C%BA%E8%AE%BE%E5%A4%87&querytype=stock")
    cookie_str = "ta_random_userid=uppxpgjh71; WafStatus=0; cid=c9859f3a7fef885083cabd7fec6aef471654074712; ComputerID=c9859f3a7fef885083cabd7fec6aef471654074712; other_uid=Ths_iwencai_Xuangu_64rlhgns9vf70ckd4tc516sfxz9ahr2w; PHPSESSID=54ad5acbc90bd750be5b087c8925c2f3; user_status=0; user=MDpteF81MzE1MDA1Njc6Ok5vbmU6NTAwOjU0MTUwMDU2Nzo3LDExMTExMTExMTExLDQwOzQ0LDExLDQwOzYsMSw0MDs1LDEsNDA7MSwxMDEsNDA7MiwxLDQwOzMsMSw0MDs1LDEsNDA7OCwwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMSw0MDsxMDIsMSw0MDoxNjo6OjUzMTUwMDU2NzoxNjU4MTIzNjAyOjo6MTU5NDE4NTEyMDoyMjcxOTg6MDoxMGUwNWQxNGNiZGYxZDc0MDA4YWRlMzQ0YTcwY2JjZDY6ZGVmYXVsdF80OjA%3D; userid=531500567; u_name=mx_531500567; escapename=mx_531500567; ticket=46c040138071be5298d0c50870b15b5c; utk=fb8a94845bcad86cb5880b4436d1e8f5; v=AwWu7XDRy1CB3u8Mgnda0cjHFEo6wrlUA3adqAdqwTxLnis0zxLJJJPGrXyU"
    for c in cookie_str.split(";"):
        # Split only on the FIRST '=': cookie values may themselves
        # contain '=' (base64 padding), which the old c.split("=")[1]
        # silently truncated.
        cookie_name, cookie_value = c.split("=", 1)
        driver.add_cookie(cookie_dict={"name": cookie_name.strip(), "value": cookie_value.strip()})
    time.sleep(2)
    # Dismiss the prompt overlay if it is present.
    try:
        ActionChains(driver).click(
            on_element=driver.find_element(by=By.CLASS_NAME, value="prompt-shade")).perform()
    except Exception:
        pass  # overlay not shown — nothing to dismiss

    industry_list = []
    for industry_name in industry_names:
        driver.get(
            "https://www.iwencai.com/unifiedwap/result?w={}&querytype=stock".format(parse.quote(industry_name)))
        time.sleep(2)
        # Collect the pagination buttons of the result table.
        pages = driver.find_element(by=By.CLASS_NAME, value="iwc-table-wrapper-outer").find_element(
            by=By.CLASS_NAME, value="pager").find_elements(by=By.CLASS_NAME, value="page-item")
        time.sleep(1)
        data_list = []
        for page in pages:
            try:
                page.find_element(by=By.TAG_NAME, value="a").click()
                time.sleep(2)
            except Exception:
                pass  # non-clickable pager item (ellipsis / current page)

            # Find the column offset of '所属同花顺行业' in the scrollable
            # header list (column order varies between queries).
            scroll_headers = driver.find_elements(by=By.CLASS_NAME, value="iwc-table-content")[0].find_element(
                by=By.CLASS_NAME, value="iwc-table-scroll").find_element(
                by=By.TAG_NAME, value="ul").find_elements(by=By.TAG_NAME, value="li")
            index = 0
            for header in scroll_headers:
                if header.text == '所属同花顺行业':
                    break
                index = index + 1

            scroll_content = driver.find_element(by=By.CLASS_NAME, value="iwc-table-content").find_element(
                by=By.CLASS_NAME, value="iwc-table-scroll").find_element(
                by=By.TAG_NAME, value="table").find_elements(by=By.TAG_NAME, value="tr")

            for row in scroll_content:
                # Hoisted: the original located the <td> list twice per row.
                cells = row.find_elements(by=By.TAG_NAME, value="td")
                code = cells[2].text
                industry = cells[4 + index].text
                industry_list.append({"code": code, "industry": industry})
                parts = industry.split('-')
                data_list.append({'_id': code, 'first_industry': parts[0],
                                  'second_industry': parts[1], 'three_industry': parts[2]})
                print(code, industry)
        # Persist this industry's rows before moving to the next query.
        save("ths-industry-codes", data_list)
|
|
if __name__ == '__main__':
    # cmdline.execute('scrapy crawl baidu'.split())
    # industrys = parse_industry()
    # get_industry_sel(industrys)

    # Parse "行业代码.txt" lines of the form "name: code" into upsertable
    # documents.
    _list = []
    with open('C:\\Users\\Administrator\\Desktop\\行业代码.txt', encoding='utf-8') as file:
        for line in file:
            # ``text`` instead of ``str``: the original shadowed the
            # builtin.  Split on the FIRST ':' only, so a code that
            # itself contains ':' is no longer truncated.
            text = line.strip()
            name, code = text.split(":", 1)
            _list.append({"_id": name.strip(), "first_code": code.strip()})

    # mongo_data.save("ths-industry", _list)