用Python爬取A股全量股票代码与名称(附完整代码与数据清洗技巧)

# 用Python爬取A股全量股票代码与名称(附完整代码与数据清洗技巧)

**Python实战：A股股票代码与名称全量抓取与智能清洗指南**

## 1. 数据抓取基础准备

在开始A股数据采集前，需要配置专业的Python开发环境。推荐使用Anaconda发行版，它预装了数据分析必备的库。以下是关键库的安装命令：

```bash
pip install requests beautifulsoup4 pandas numpy fake-useragent
```

对于需要处理JavaScript渲染页面的情况，建议额外安装：

```bash
pip install selenium webdriver-manager
```

核心工具链选择依据：

- requests：轻量级HTTP请求库，适合静态页面抓取
- BeautifulSoup：HTML解析利器，支持多种解析引擎
- pandas：数据清洗与存储的核心工具
- fake-useragent：动态生成请求头，有效规避基础反爬

> 重要提示：实际操作中建议设置2-3秒的请求间隔，避免对目标服务器造成过大压力。金融数据类网站通常对高频访问有严格限制。

## 2. 目标源分析与反爬策略

### 2.1 可靠数据源评估

通过对比测试，笔者推荐以下几个稳定数据源：

| 数据源 | 优点 | 缺点 | 更新频率 |
| --- | --- | --- | --- |
| 东方财富网 | 结构清晰 | 需要处理分页 | 交易日实时 |
| 新浪财经 | API接口稳定 | 需要签名验证 | 15分钟延迟 |
| 同花顺 | 数据全面 | 动态加载复杂 | 交易日更新 |

### 2.2 反爬破解实战方案

常见反爬机制应对策略：

User-Agent检测：

```python
from fake_useragent import UserAgent

headers = {'User-Agent': UserAgent().random}
```

IP频率限制：

```python
import time
import random

time.sleep(random.uniform(1, 3))  # 随机延迟
```

验证码触发：

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

def bypass_captcha(driver):
    try:
        WebDriverWait(driver, 10).until(
            lambda x: x.find_element(By.ID, 'captcha'))
        # 这里添加自动识别或手动处理逻辑
    except:
        pass
```

## 3. 网页解析与数据提取

### 3.1 HTML结构解析技巧

以东方财富网为例，股票列表通常呈现为表格结构。使用BeautifulSoup提取的典型代码：

```python
def parse_stock_table(html):
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', {'class': 'stock-table'})
    results = {}
    for row in table.find_all('tr')[1:]:  # 跳过表头
        cols = row.find_all('td')
        if len(cols) >= 2:
            code = cols[0].text.strip()
            name = cols[1].text.strip()
            results[code] = name
    return results
```

特殊字符处理方案：

```python
import re

def clean_name(name):
    name = re.sub(r'[*ST]', '', name)  # 去除特殊标记
    name = name.replace(' ', '')  # 去除空格
    return name.upper()  # 统一大写
```

### 3.2 动态内容抓取方案

对于AJAX加载的页面，可采用Selenium模拟浏览器：

```python
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://quote.eastmoney.com/stocklist.html')
# 等待动态加载完成
WebDriverWait(driver, 10).until(
    lambda d: d.find_element_by_css_selector('.stock-table'))
html = driver.page_source
driver.quit()
```

## 4. 专业级数据清洗流程

### 4.1 异常数据处理规范

建立完整的清洗管道：

```python
def data_cleaning(raw_data):
    cleaned = {}
    for code, name in raw_data.items():
        # 验证股票代码格式
        if not re.match(r'^[036]\d{5}$', code):
            continue
        # 处理特殊名称
        name = name.split('(')[0]  # 去除括号内容
        name = name.replace('*', '').replace('ST', '')
        # B股特殊处理
        if code.startswith('900'):
            name += '(B股)'
        cleaned[code] = name
    return cleaned
```

### 4.2 数据验证机制

添加自动校验环节：

```python
def validate_data(stock_dict):
    valid_codes = []
    for code in stock_dict:
        if len(code) == 6 and code.isdigit():
            prefix = code[0]
            if prefix in ('0', '3', '6'):
                valid_codes.append(code)
    return len(valid_codes) / len(stock_dict) > 0.95  # 合格率阈值
```

## 5. 数据存储与更新策略

### 5.1 多格式存储实现

```python
import json
import csv

def save_data(data, filename):
    # JSON格式
    with open(f'{filename}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
    # CSV格式
    with open(f'{filename}.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['股票代码', '股票名称'])
        for code, name in data.items():
            writer.writerow([code, name])
```

### 5.2 增量更新方案

```python
import os
from datetime import datetime

def incremental_update(new_data):
    timestamp = datetime.now().strftime('%Y%m%d')
    filename = f'stock_data_{timestamp}'
    if not os.path.exists('archive'):
        os.makedirs('archive')
    save_data(new_data, f'archive/{filename}')
    # 更新最新数据文件
    save_data(new_data, 'stock_data_latest')
```

## 6. 完整代码实现与优化

### 6.1 工程化代码结构

```python
"""
A股股票数据采集系统
功能模块：
1. 多数据源采集
2. 智能反爬处理
3. 自动化清洗流程
4. 多格式输出
"""

class AShareCrawler:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': UserAgent().random,
            'Accept-Language': 'zh-CN,zh;q=0.9'
        })

    def fetch_from_source(self, source_url):
        try:
            resp = self.session.get(source_url, timeout=10)
            resp.raise_for_status()
            return self.parse_response(resp.text)
        except Exception as e:
            print(f'抓取失败: {str(e)}')
            return {}

    def parse_response(self, html):
        # 实现各网站的解析逻辑
        pass

    def run(self):
        sources = [
            'https://quote.eastmoney.com/stocklist.html',
            'https://finance.sina.com.cn/stock/'
        ]
        all_data = {}
        for url in sources:
            print(f'正在采集: {url}')
            data = self.fetch_from_source(url)
            all_data.update(data)
            time.sleep(random.uniform(1, 3))
        cleaned_data = self.data_cleaning(all_data)
        self.save_data(cleaned_data)
        return cleaned_data
```

### 6.2 性能优化技巧

异步请求加速：

```python
import aiohttp
import asyncio

async def async_fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()
```

多线程处理：

```python
from concurrent.futures import ThreadPoolExecutor

def multi_thread_crawl(urls):
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(fetch_from_source, urls))
    return results
```

## 7. 实战问题解决方案

常见问题处理经验：

网站结构变更应对：

```python
def adaptive_parse(html):
    # 尝试多种选择器
    parsers = [
        lambda x: x.find('table', {'class': 'stock-table'}),
        lambda x: x.find('ul', {'id': 'stockList'}),
        lambda x: x.find('div', {'class': 'stock-content'})
    ]
    for parser in parsers:
        result = parser(BeautifulSoup(html, 'lxml'))
        if result:
            return extract_data(result)
    raise ValueError('无法解析页面结构')
```

数据校验规则：

```python
def validate_stock_code(code):
    """验证股票代码格式"""
    if not code.isdigit() or len(code) != 6:
        return False
    first_char = code[0]
    return first_char in ('0', '3', '6', '9')
```

## 8. 进阶应用场景

### 8.1 数据可视化分析

```python
import matplotlib.pyplot as plt

def analyze_stock_types(data):
    type_dist = {'主板': 0, '创业板': 0, '科创板': 0}
    for code in data:
        if code.startswith('6'):
            type_dist['主板'] += 1
        elif code.startswith('3'):
            type_dist['创业板'] += 1
        elif code.startswith('688'):
            type_dist['科创板'] += 1
    plt.pie(type_dist.values(), labels=type_dist.keys(), autopct='%1.1f%%')
    plt.title('A股市场类型分布')
    plt.show()
```

### 8.2 自动化监控系统

```python
import schedule
import time

def daily_job():
    crawler = AShareCrawler()
    crawler.run()

# 设置每个交易日15:30执行
schedule.every().day.at('15:30').do(daily_job)

while True:
    schedule.run_pending()
    time.sleep(60)
```

## 9. 项目扩展方向

实时行情对接：

```python
def get_realtime_quote(stock_code):
    url = f'https://qt.gtimg.cn/q={stock_code}'
    response = requests.get(url)
    # 解析返回的行情数据
    return parse_quote(response.text)
```

基本面数据采集：

```python
def fetch_financial_report(code):
    url = f'https://emsec.eastmoney.com/PC_HSF10/FinancialAnalysis/Index?code={code}'
    driver.get(url)
    # 提取财务报表数据
    return extract_financial_data(driver.page_source)
```

## 10. 生产环境部署建议

日志监控配置示例：

```python
import logging
from logging.handlers import RotatingFileHandler

def setup_logger():
    logger = logging.getLogger('stock_crawler')
    logger.setLevel(logging.INFO)
    handler = RotatingFileHandler(
        'crawler.log', maxBytes=10*1024*1024, backupCount=5)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger
```

异常处理增强：

```python
def safe_crawl(url):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f'请求失败: {url} - {str(e)}')
        return None
    except Exception as e:
        logger.exception(f'未知错误: {str(e)}')
        raise
```