query_table

by wukan1986
MIT License
OverviewInspectSchema Related Servers Reviews Score
query_table
sites
"""
东方财富 条件选股
https://xuangu.eastmoney.com/

1. 部分数据中包含中文单位，如万亿等，导致无法转换为数字，如VOLUME
2. 东财翻页需要提前手工登录
3. 东财翻页是页面已经翻了，然后等数据来更新，懒加载
"""
import re

import pandas as pd
from loguru import logger
from playwright.async_api import Page

from query_table.enums import QueryType

# 查询结果
# 'https://np-pick-b.eastmoney.com/api/smart-tag/stock/v3/pw/search-code'
# 'https://np-pick-b.eastmoney.com/api/smart-tag/fund/v3/pw/search-code'
# 'https://np-pick-b.eastmoney.com/api/smart-tag/hk/v3/pw/search-code'
# 'https://np-pick-b.eastmoney.com/api/smart-tag/cb/v3/pw/search-code'
# 'https://np-pick-b.eastmoney.com/api/smart-tag/etf/v3/pw/search-code'
# 'https://np-pick-b.eastmoney.com/api/smart-tag/bk/v3/pw/search-code'
_PAGE0_ = 'https://np-pick-b.eastmoney.com/api/smart-tag'
_PAGE1_ = 'https://np-pick-b.eastmoney.com/api/smart-tag/{}/v3/pw/search-code'

_type_ = {
    QueryType.CNStock: 'stock',
    QueryType.Fund: 'fund',
    QueryType.HKStock: 'hk',
    QueryType.ConBond: 'cb',
    QueryType.ETF: 'etf',
    QueryType.Board: 'bk',
}


def convert_type(type):
    if type == 'Double':
        return float
    if type == 'String':
        return str
    if type == 'Long':
        return int
    if type == 'Boolean':
        return bool
    if type == 'INT':  # TODO 好像未出现过
        return int
    return type


class Pagination:
    def __init__(self):
        self.datas = {}
        self.pageNo = 1
        self.pageSize = 100
        self.total = 1024
        self.columns = []
        self.datas = {}

    def reset(self):
        self.datas = {}

    def update(self, pageNo, pageSize, total, columns, dataList):
        self.pageNo = pageNo
        self.pageSize = pageSize
        self.total = total
        self.columns = columns
        self.datas[self.pageNo] = dataList

    def has_next(self, max_page):
        c1 = self.pageNo * self.pageSize < self.total
        c2 = self.pageNo < max_page
        return c1 & c2

    def current(self):
        return self.pageNo

    def get_list(self):
        datas = []
        for k, v in self.datas.items():
            datas.extend(v)
        return datas

    def get_dataframe(self):
        columns = {x['key']: x['title'] for x in self.columns}
        dtypes = {x['key']: convert_type(x['dataType']) for x in self.columns}

        df = pd.DataFrame(self.get_list())
        for k, v in dtypes.items():
            if k == 'SERIAL':
                df[k] = df[k].astype(int)
                continue
            if isinstance(v, str):
                logger.info("未识别的数据类型 {}:{}", k, v)
                continue
            try:
                df[k] = df[k].astype(v)
            except ValueError:
                logger.info("转换失败 {}:{}", k, v)

        return df.rename(columns=columns)


P = Pagination()


def search_code(json_data):
    total = json_data['data']['result']['total']
    columns = json_data['data']['result']['columns']
    dataList = json_data['data']['result']['dataList']
    return total, columns, dataList


async def on_response(response):
    # if not response.url.startswith(_PAGE0_):
    #     return
    post_data_json = response.request.post_data_json
    pageNo = post_data_json['pageNo']
    pageSize = post_data_json['pageSize']
    P.update(pageNo, pageSize, *search_code(await response.json()))


async def query(page: Page,
                q: str = "收盘价>100元",
                type_: QueryType = 'stock',
                max_page: int = 5) -> pd.DataFrame:
    type = _type_.get(type_, None)
    assert type is not None, f"不支持的类型:{type_}"
    url = _PAGE1_.format(type)

    await page.route(re.compile(r'.*\.(?:jpg|jpeg|png|gif|webp)(?:$|\?)'), lambda route: route.abort())

    P.reset()
    async with page.expect_response(url) as response_info:
        # 这里不用处理输入编码问题
        await page.goto(f"https://xuangu.eastmoney.com/Result?q={q}&type={type}", wait_until="load")
    await on_response(await response_info.value)

    while P.has_next(max_page):
        logger.info("当前页为:{}, 点击`下一页`", P.current())

        # 这种写法解决了懒加载问题
        async with page.expect_response(url) as response_info:
            await page.get_by_role("button", name="下一页").click()
        await on_response(await response_info.value)

    return P.get_dataframe()