DART(금융감독원 전자공시시스템)에서 재무제표 데이터를 크롤링하는 방법을 단계별로 설명드리겠습니다. 파이썬의 requests, BeautifulSoup, pandas 라이브러리를 사용합니다. 단, 웹사이트 구조가 변경되면 코드가 그대로 동작하지 않을 수 있으며, 과도한 요청은 서버에 부하를 줄 수 있으므로 요청 사이에 간격을 두는 등 주의가 필요합니다.
pip install requests beautifulsoup4 pandas openpyxl
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
# Search criteria read by get_report_list(). The example values target
# Samsung Electronics (stock code 005930) annual reports dated within 2023.
COMPANY_CODE = "005930" # Stock code; sent as the "businessCode" query parameter
START_DATE = "20230101" # Search start date (YYYYMMDD); sent as "startDate"
END_DATE = "20231231" # Search end date (YYYYMMDD); sent as "endDate"
REPORT_TYPE = "A001" # A001: Annual Report, A002: Semi-Annual Report, A003: Quarterly Report
# DART disclosure search URL; also the base against which relative report links are resolved.
# NOTE(review): the plain-http scheme and endpoint path are taken as-is -- confirm they still match the live site.
SEARCH_URL = "http://dart.fss.or.kr/dsab001/search.ax"
def get_report_list(timeout=10):
    """Fetch the result rows of a DART disclosure search.

    Sends a GET request to ``SEARCH_URL`` with the module-level search
    criteria (``COMPANY_CODE``, ``START_DATE``, ``END_DATE``,
    ``REPORT_TYPE``) and parses the returned HTML.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``tr`` elements matched by ``.table_list tr`` with the first
        (header) row dropped. Empty when the page contains no such rows.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    params = {
        "currentPage": 1,
        "maxResults": 10,
        "businessCode": COMPANY_CODE,
        "startDate": START_DATE,
        "endDate": END_DATE,
        "reportName": REPORT_TYPE,
    }
    response = requests.get(SEARCH_URL, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing the error page as if
    # it were a (seemingly empty) result list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.select(".table_list tr")[1:]  # skip the header row
def extract_excel_url(report_url, timeout=10):
    """Find the download link on a report page.

    Fetches ``report_url`` and looks for the first anchor whose ``href``
    contains ``download.xbrl``.

    Args:
        report_url: Absolute URL of the report page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The absolute download URL (the link's ``href`` resolved against
        ``report_url``), or ``None`` when the page has no matching link.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(report_url, timeout=timeout)
    # An error page would never contain the link; surface the real cause
    # rather than reporting "not found".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    excel_link = soup.select_one("a[href*='download.xbrl']")
    if excel_link is None:
        return None
    # The href may be relative, so resolve it against the page URL.
    return urljoin(report_url, excel_link["href"])
def download_excel(url, timeout=30):
    """Download an Excel file and load it into a DataFrame.

    The response body is parsed from an in-memory buffer, so no scratch file
    is left behind in the working directory and concurrent runs cannot
    overwrite each other's download.

    Args:
        url: Absolute URL of the Excel file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_excel`` with the
        ``openpyxl`` engine (its default of reading the first sheet applies).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    import io  # local import keeps this block self-contained

    response = requests.get(url, timeout=timeout)
    # Do not hand an HTML error page to the Excel parser.
    response.raise_for_status()
    return pd.read_excel(io.BytesIO(response.content), engine="openpyxl")
# Main execution: list the matching reports, then download and preview the
# Excel attachment of up to the first three.
if __name__ == "__main__":
    import time  # local import keeps this block self-contained

    # Pause between reports so the crawl does not hammer the server.
    request_delay_seconds = 1.0

    reports = get_report_list()
    if not reports:
        print("No reports found.")

    for idx, report in enumerate(reports[:3]):  # process up to 3 reports
        if idx:
            time.sleep(request_delay_seconds)

        # The title link sits in the third cell. Rows without one (for
        # example a "no results" placeholder row) are skipped rather than
        # crashing on a None lookup.
        link = report.select_one("td:nth-child(3) a")
        if link is None or not link.get("href"):
            print(f"[{idx+1}] Skipping row without a report link.")
            continue

        title = link.text.strip()
        report_url = urljoin(SEARCH_URL, link["href"])
        print(f"[{idx+1}] Extracting data from {title}...")

        # A network failure on one report should not abort the others.
        try:
            excel_url = extract_excel_url(report_url)
            df = download_excel(excel_url) if excel_url else None
        except requests.RequestException as exc:
            print(f"Request failed: {exc}")
            continue

        if df is None:
            print("Excel file not found.")
        else:
            print(df.head())  # preview the data
이 코드를 기반으로 추가적인 데이터 전처리 및 퀀트 분석 로직을 구현할 수 있습니다.
0