سأشرح خطوة بخطوة كيفية استخراج بيانات القوائم المالية (web crawling) من نظام الإفصاح الإلكتروني DART. نستخدم مكتبات Python التالية: requests وBeautifulSoup وpandas. ومع ذلك، يرجى الانتباه عند الاستخدام الفعلي إلى أن هيكل الموقع قد يتغير، ويجب توخي الحذر لأن الإفراط في إرسال الطلبات قد يُثقل كاهل الخادم.
pip install requests beautifulsoup4 pandas openpyxl
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
<br/>
# Search criteria sent as query parameters by get_report_list()
# (example values: Samsung Electronics (005930) annual report).
COMPANY_CODE = "005930" # Stock code of the company to search for
START_DATE = "20230101" # Search start date (YYYYMMDD)
END_DATE = "20231231" # Search end date (YYYYMMDD)
REPORT_TYPE = "A001" # A001: Annual Report, A002: Semi-Annual Report, A003: Quarterly Report
<br/>
# DART disclosure search URL. It is requested with the search criteria as
# GET parameters and also serves as the base for resolving relative report links.
# NOTE(review): this is plain http — confirm whether an https endpoint should be used.
SEARCH_URL = "http://dart.fss.or.kr/dsab001/search.ax"
def get_report_list(timeout=10):
    """Fetch the rows of the DART disclosure search result table.

    Sends a GET request to SEARCH_URL with the module-level search criteria
    (COMPANY_CODE, START_DATE, END_DATE, REPORT_TYPE) as query parameters
    and parses the returned HTML.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The elements matching ".table_list tr" with the first (header) row
        dropped; an empty list when nothing matches.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    params = {
        "currentPage": 1,
        "maxResults": 10,
        "businessCode": COMPANY_CODE,
        "startDate": START_DATE,
        "endDate": END_DATE,
        "reportName": REPORT_TYPE,
    }
    response = requests.get(SEARCH_URL, params=params, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.select(".table_list tr")[1:]  # Skip the header row
def extract_excel_url(report_url, timeout=10):
    """Find the Excel download link on a report page.

    Args:
        report_url: URL of the report page to inspect.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The absolute URL of the first anchor whose href contains
        "download.xbrl", resolved against report_url, or None when the page
        has no such anchor.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    response = requests.get(report_url, timeout=timeout)
    # Fail loudly instead of searching an error page for the link.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    excel_link = soup.select_one("a[href*='download.xbrl']")
    if excel_link is None:
        return None
    # The href may be relative, so resolve it against the page URL.
    return urljoin(report_url, excel_link['href'])
def download_excel(url, timeout=10):
    """Download an Excel file and load it into a DataFrame.

    The downloaded bytes are parsed from an in-memory buffer, so no
    temporary file is written to the working directory.

    Args:
        url: URL of the Excel file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The DataFrame produced by pandas.read_excel with the openpyxl engine.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    import io  # local import: keeps this block self-contained

    response = requests.get(url, timeout=timeout)
    # Do not hand an HTML error page to the Excel parser.
    response.raise_for_status()
    return pd.read_excel(io.BytesIO(response.content), engine='openpyxl')
<br/>
# Main Execution
if __name__ == "__main__":
    # Script entry point: list the matching reports, then for each one try to
    # locate its Excel download and print a preview of the data.
    reports = get_report_list()
    for idx, report in enumerate(reports[:3], start=1):  # Process up to 3 reports
        # The report title and link are taken from the anchor in the third cell.
        link = report.select_one("td:nth-child(3) a")
        if link is None or not link.get("href"):
            # A row without a report link cannot be processed; skip it
            # instead of crashing on a missing element.
            print(f"[{idx}] Report link not found; skipping row.")
            continue
        title = link.text.strip()
        # The href may be relative, so resolve it against the search URL.
        report_url = urljoin(SEARCH_URL, link["href"])
        print(f"[{idx}] Extracting data from {title}...")
        # Extract the Excel file URL and download it.
        excel_url = extract_excel_url(report_url)
        if excel_url:
            df = download_excel(excel_url)
            print(df.head())  # Check the data
        else:
            print("Excel file not found.")
استنادًا إلى هذه الشيفرة البرمجية، يمكنك إضافة معالجة إضافية للبيانات ومنطق للتحليل الكمي.
0