DART(韓国金融監督院の電子開示システム)から財務諸表をクロールする方法を説明します。Pythonのrequests、BeautifulSoup、pandasの各ライブラリを使用します。ただし、実際に使用する際は、Webサイトの構造が変更される可能性があることに注意してください。また、サーバーに過剰なリクエストを送って負荷をかけないようにしてください。
pip install requests beautifulsoup4 pandas openpyxl
import io
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Search criteria (example: Samsung Electronics, stock code 005930, annual report).
COMPANY_CODE = "005930"  # Stock code
START_DATE = "20230101"  # Search start date (YYYYMMDD)
END_DATE = "20231231"  # Search end date (YYYYMMDD)
REPORT_TYPE = "A001"  # A001: annual report, A002: semi-annual report, A003: quarterly report

# DART disclosure search URL. HTTPS is used so the query is not sent in clear text.
SEARCH_URL = "https://dart.fss.or.kr/dsab001/search.ax"
def get_report_list(timeout=30):
    """Fetch the rows of the DART disclosure search result table.

    Sends one GET request to ``SEARCH_URL`` built from the module-level
    search criteria (``COMPANY_CODE``, ``START_DATE``, ``END_DATE``,
    ``REPORT_TYPE``) and parses the response body as HTML.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``tr`` elements matched by ``.table_list tr`` with the first
        (header) row dropped. Empty when nothing matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    params = {
        "currentPage": 1,
        "maxResults": 10,
        "businessCode": COMPANY_CODE,
        "startDate": START_DATE,
        "endDate": END_DATE,
        "reportName": REPORT_TYPE,
    }
    response = requests.get(SEARCH_URL, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.select(".table_list tr")[1:]  # Skip the header row
def extract_excel_url(report_url, timeout=30):
    """Return the absolute download URL found on a report page, if any.

    Fetches ``report_url``, parses it as HTML and looks for the first anchor
    whose ``href`` contains ``download.xbrl``.

    Args:
        report_url: URL of the report page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The link's ``href`` resolved against ``report_url``, or ``None`` when
        the page contains no matching anchor.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(report_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of searching an error page for links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    excel_link = soup.select_one("a[href*='download.xbrl']")
    if excel_link is None:
        return None
    # The href may be relative, so resolve it against the page URL.
    return urljoin(report_url, excel_link["href"])
def download_excel(url, timeout=30):
    """Download an Excel workbook and load it into a DataFrame.

    The payload is parsed from memory, so no ``temp.xlsx`` file is left
    behind in the working directory or overwritten by later calls.

    Args:
        url: URL of the Excel file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_excel`` with the
        ``openpyxl`` engine.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # NOTE(review): callers pass a link matched on 'download.xbrl'; confirm
    # the payload really is an .xlsx workbook that openpyxl can read.
    return pd.read_excel(io.BytesIO(response.content), engine="openpyxl")
# Main execution: print the first rows of the Excel data for up to 3 reports.
if __name__ == "__main__":
    reports = get_report_list()
    for idx, report in enumerate(reports[:3], start=1):  # Process up to 3 reports
        # The report title link is looked up once in the third cell. Rows
        # without it (possibly a "no results" placeholder row) are skipped
        # instead of crashing on a None lookup.
        link = report.select_one("td:nth-child(3) a")
        if link is None or not link.get("href"):
            print(f"[{idx}] Report link not found; skipping row.")
            continue

        title = link.text.strip()
        # The href may be relative, so resolve it against the search URL.
        report_url = urljoin(SEARCH_URL, link["href"])
        print(f"[{idx}] Extracting data from {title}...")

        # Extract the Excel file URL and download it
        excel_url = extract_excel_url(report_url)
        if excel_url:
            df = download_excel(excel_url)
            print(df.head())  # Check the data
        else:
            print("Excel file not found.")
このコードをベースにして、追加のデータ前処理やクオンツ(定量)分析のロジックを実装できます。
0