# SberHistoryToExcel/load_data.py

from __future__ import annotations

import argparse
import math
import os
import zipfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable
from xml.sax.saxutils import escape

import requests


# Default endpoint for the exporter's `url` field; the paginated operation-list
# requests are POSTed to it.
SBER_OPERATIONS_URL = "https://web-node1.online.sberbank.ru/uoh-bh/v1/operations/list"
# strftime/strptime pattern (day.month.year "T" hour:minute:second) used both to
# format the request's "from" filter and to parse each operation's "date" value.
DATE_FORMAT = "%d.%m.%YT%H:%M:%S"


@dataclass
class SberMonthHistoryExporter:
    """Download Sber operations for one month and write them into one XLSX file.

    The exporter owns a ``requests.Session``. Call :meth:`close` (or use the
    instance as a context manager) to release it once the export is done.

    Raises:
        ValueError: On construction, if ``cookie`` is empty or ``page_size`` is
            not positive.
    """

    # Value sent as the ``Cookie`` request header; defaults to the ``Cookie``
    # environment variable. Excluded from the generated ``repr`` so the auth
    # secret does not leak into logs or tracebacks.
    cookie: str = field(default_factory=lambda: os.getenv("Cookie", ""), repr=False)
    # Endpoint that receives the paginated POST requests.
    url: str = SBER_OPERATIONS_URL
    # Number of operations requested per page ("paginationSize").
    page_size: int = 50
    # Timeout handed to ``requests`` for every request.
    timeout: int = 30
    # Upper bound on the number of pages requested by one ``fetch_month`` call.
    max_pages: int = 500

    def __post_init__(self) -> None:
        """Validate the configuration and open the HTTP session."""
        if not self.cookie:
            raise ValueError("Set Sber auth cookie in the Cookie environment variable.")
        if self.page_size <= 0:
            raise ValueError("page_size must be positive.")

        self.session = requests.Session()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self) -> SberMonthHistoryExporter:
        """Return the exporter itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the session on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def fetch_month(self, year: int, month: int) -> list[dict[str, Any]]:
        """Return all operations whose date belongs to the requested month.

        Pages are requested until an empty or short page arrives, or until a
        page contains an operation dated before the month start. Operations
        whose date cannot be parsed are skipped, duplicates (by
        ``_operation_identity``) are dropped, and the result is sorted by date
        in ascending order.

        Args:
            year: Calendar year of the month to export.
            month: Month number, 1 to 12.

        Returns:
            The raw operation dictionaries of that month, oldest first.

        Raises:
            ValueError: If ``month`` is outside 1..12.
            RuntimeError: If ``max_pages`` pages were fetched without reaching
                the end, or the API returned an unexpected payload.
        """
        start, end = self._month_bounds(year, month)
        # Keep the parsed date next to each operation so sorting does not have
        # to parse every date a second time.
        dated_operations: list[tuple[datetime, dict[str, Any]]] = []
        seen_ids: set[str] = set()

        for page_number in range(self.max_pages):
            page = self._fetch_page(offset=page_number * self.page_size, month_start=start)
            if not page:
                break

            oldest_on_page: datetime | None = None
            for operation in page:
                operation_date = self._parse_operation_date(operation)
                if operation_date is None:
                    continue

                if oldest_on_page is None or operation_date < oldest_on_page:
                    oldest_on_page = operation_date
                if not start <= operation_date < end:
                    continue

                operation_id = self._operation_identity(operation)
                if operation_id not in seen_ids:
                    seen_ids.add(operation_id)
                    dated_operations.append((operation_date, operation))

            # A short page means the server has nothing more to return.
            if len(page) < self.page_size:
                break

            # Sber returns operations from newest to oldest, so going before month start
            # means that the following pages cannot contain the requested month.
            if oldest_on_page is not None and oldest_on_page < start:
                break
        else:
            raise RuntimeError(f"Reached max_pages={self.max_pages}; export stopped to avoid an endless loop.")

        # Sort on the date only; list.sort is stable, so equal dates keep fetch order.
        dated_operations.sort(key=lambda pair: pair[0])
        return [operation for _, operation in dated_operations]

    def save_month(self, year: int, month: int, output_path: str | Path | None = None) -> Path:
        """Fetch a month and save it to one XLSX file.

        Args:
            year: Calendar year of the month to export.
            month: Month number, 1 to 12.
            output_path: Destination file; defaults to
                ``sber_operations_<year>_<month>.xlsx`` in the working directory.

        Returns:
            The path the workbook was written to.
        """
        operations = self.fetch_month(year, month)
        path = Path(output_path or f"sber_operations_{year}_{month:02d}.xlsx")
        rows = [self._operation_to_row(operation) for operation in operations]
        self._write_xlsx(path, rows)
        return path

    def _fetch_page(self, offset: int, month_start: datetime) -> list[dict[str, Any]]:
        """Request one page of operations starting at ``offset``.

        Args:
            offset: Value sent as "paginationOffset".
            month_start: Lower date bound, sent as the "from" filter.

        Returns:
            The list found under ``body.operations`` (empty if the key is absent).

        Raises:
            requests.HTTPError: If the HTTP status signals an error.
            RuntimeError: If the response is not a successful JSON object or
                its body/operations have an unexpected type.
        """
        payload = {
            "paginationOffset": offset,
            "paginationSize": self.page_size,
            "showHidden": False,
            "showNotTransactionBonuses": True,
            "showOpenBanking": True,
            "from": month_start.strftime(DATE_FORMAT),
        }

        response = self.session.post(
            self.url,
            headers=self._headers(),
            json=payload,
            timeout=self.timeout,
        )
        response.raise_for_status()

        data = response.json()
        if not isinstance(data, dict) or not data.get("success", False):
            raise RuntimeError(f"Sber API returned unsuccessful response: {data}")

        # Report a null or otherwise malformed body with the same exception type
        # as the other payload problems instead of an obscure AttributeError.
        body = data.get("body", {})
        if not isinstance(body, dict):
            raise RuntimeError(f"Unexpected response body: {body!r}")

        operations = body.get("operations", [])
        if not isinstance(operations, list):
            raise RuntimeError(f"Unexpected operations payload: {operations!r}")

        return operations

    def _headers(self) -> dict[str, str]:
        """Return the browser-like request headers, including the auth cookie."""
        return {
            "accept": "application/json, text/plain, */*",
            "accept-language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "content-type": "application/json;charset=UTF-8",
            "origin": "https://online.sberbank.ru",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://online.sberbank.ru/",
            "sec-ch-ua": '"Google Chrome";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"macOS"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/147.0.0.0 Safari/537.36"
            ),
            "x-requested-with": "XMLHttpRequest",
            "Cookie": self.cookie,
        }

    @staticmethod
    def _month_bounds(year: int, month: int) -> tuple[datetime, datetime]:
        """Return the half-open interval ``[start, end)`` covering the month.

        Raises:
            ValueError: If ``month`` is outside 1..12.
        """
        if month < 1 or month > 12:
            raise ValueError("month must be between 1 and 12.")

        start = datetime(year=year, month=month, day=1)
        if month == 12:
            # December rolls over into January of the next year.
            end = datetime(year=year + 1, month=1, day=1)
        else:
            end = datetime(year=year, month=month + 1, day=1)

        return start, end

    @staticmethod
    def _parse_operation_date(operation: dict[str, Any]) -> datetime | None:
        """Parse the operation's "date" value, or return None if it is unusable.

        Any fractional-seconds suffix (".000", ".123", ...) is discarded because
        ``DATE_FORMAT`` has second precision.
        """
        raw_date = operation.get("date")
        if not isinstance(raw_date, str):
            return None

        # Only the part after "T" is cut at the first dot: the date part itself
        # uses dots as separators.
        date_part, separator, time_part = raw_date.partition("T")
        normalized = f"{date_part}{separator}{time_part.split('.', 1)[0]}"

        try:
            return datetime.strptime(normalized, DATE_FORMAT)
        except ValueError:
            return None

    @staticmethod
    def _operation_identity(operation: dict[str, Any]) -> str:
        """Return a key used to de-duplicate operations across pages.

        The first non-empty of "uohId", "externalId" and "authorizationDocId"
        wins; without any of them the ``repr`` of the whole operation is used.
        """
        for key in ("uohId", "externalId", "authorizationDocId"):
            value = operation.get(key)
            if value:
                return str(value)
        return repr(operation)

    @staticmethod
    def _operation_to_row(operation: dict[str, Any]) -> dict[str, Any]:
        """Flatten one operation into a spreadsheet row keyed by column title.

        Missing values become empty strings; the bonus column is the sum of the
        numeric "income" values of the operation's "bonuses" entries.
        """
        # ``or {}`` also covers keys that are present but null.
        operation_amount = operation.get("operationAmount") or {}
        national_amount = operation.get("nationalAmount") or {}
        billing_amount = operation.get("billingAmount") or {}
        state = operation.get("state") or {}
        from_resource = operation.get("fromResource") or {}
        bonuses = operation.get("bonuses") or []

        bonus_income = 0
        if isinstance(bonuses, list):
            bonus_income = sum(
                bonus.get("income", 0)
                for bonus in bonuses
                if isinstance(bonus, dict) and isinstance(bonus.get("income", 0), (int, float))
            )

        return {
            "Дата": operation.get("date", ""),
            "Получатель": operation.get("correspondent", ""),
            "Описание": operation.get("description", ""),
            "Сумма операции": operation_amount.get("amount", ""),
            "Валюта операции": operation_amount.get("currencyCode", ""),
            "Сумма в RUB": national_amount.get("amount", ""),
            "Валюта": national_amount.get("currencyCode", ""),
            "Бонусы Спасибо": bonus_income,
            "Счет": from_resource.get("displayedValue", ""),
            "Остаток после операции": billing_amount.get("amount", ""),
            "Статус": state.get("category", ""),
            "Тип": operation.get("type", ""),
            "Код категории": operation.get("classificationCode", ""),
            "ID": operation.get("uohId") or operation.get("externalId", ""),
        }

    @staticmethod
    def _write_xlsx(path: Path, rows: Iterable[dict[str, Any]]) -> None:
        """Write ``rows`` as a single-sheet XLSX workbook at ``path``.

        The header row comes from the keys of the first row. With no rows, the
        columns of ``_operation_to_row`` are used, so an empty month still
        produces a workbook with the usual header. Parent directories are
        created as needed.
        """
        materialized_rows = list(rows)
        if materialized_rows:
            headers = list(materialized_rows[0].keys())
        else:
            # Derive the default header from the row builder so the two cannot drift apart.
            headers = list(SberMonthHistoryExporter._operation_to_row({}))

        path.parent.mkdir(parents=True, exist_ok=True)
        worksheet_xml = _build_worksheet_xml(headers, materialized_rows)

        with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
            archive.writestr("[Content_Types].xml", _content_types_xml())
            archive.writestr("_rels/.rels", _root_rels_xml())
            archive.writestr("xl/workbook.xml", _workbook_xml())
            archive.writestr("xl/_rels/workbook.xml.rels", _workbook_rels_xml())
            archive.writestr("xl/styles.xml", _styles_xml())
            archive.writestr("xl/worksheets/sheet1.xml", worksheet_xml)


def _build_worksheet_xml(headers: list[str], rows: list[dict[str, Any]]) -> str:
    """Render the worksheet part: a header row followed by one row per record.

    Row 1 holds ``headers``; record *i* goes to row *i + 2*, with a value
    looked up for every header (empty string when the record lacks the key).
    The ``dimension`` element spans from A1 to the last column and last row.
    """
    header_row = _build_row_xml(1, headers)
    data_rows = "".join(
        _build_row_xml(row_number, [record.get(header, "") for header in headers])
        for row_number, record in enumerate(rows, start=2)
    )
    # The header always occupies row 1, so the last used row is len(rows) + 1.
    used_range = f"A1:{_column_name(len(headers))}{len(rows) + 1}"

    fragments = [
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
        '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" ',
        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">',
        f'<dimension ref="{used_range}"/>',
        '<sheetViews><sheetView workbookViewId="0"/></sheetViews>',
        '<sheetFormatPr defaultRowHeight="15"/>',
        '<sheetData>',
        header_row,
        data_rows,
        '</sheetData>',
        '</worksheet>',
    ]
    return "".join(fragments)


def _build_row_xml(row_index: int, values: list[Any]) -> str:
    """Render one ``<row>`` element whose cells hold ``values`` from column A on."""
    rendered_cells = "".join(
        _build_cell_xml(f"{_column_name(position)}{row_index}", cell_value)
        for position, cell_value in enumerate(values, start=1)
    )
    return f'<row r="{row_index}">{rendered_cells}</row>'


def _build_cell_xml(cell_reference: str, value: Any) -> str:
    if value is None:
        value = ""

    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f'<c r="{cell_reference}"><v>{value}</v></c>'

    return f'<c r="{cell_reference}" t="inlineStr"><is><t>{escape(str(value))}</t></is></c>'


def _column_name(index: int) -> str:
    name = ""
    while index:
        index, remainder = divmod(index - 1, 26)
        name = chr(65 + remainder) + name
    return name


def _content_types_xml() -> str:
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
        '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
        '<Default Extension="xml" ContentType="application/xml"/>'
        '<Override PartName="/xl/workbook.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>'
        '<Override PartName="/xl/worksheets/sheet1.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>'
        '<Override PartName="/xl/styles.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml"/>'
        '</Types>'
    )


def _root_rels_xml() -> str:
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        '<Relationship Id="rId1" '
        'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
        'Target="xl/workbook.xml"/>'
        '</Relationships>'
    )


def _workbook_xml() -> str:
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" '
        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
        '<sheets><sheet name="Operations" sheetId="1" r:id="rId1"/></sheets>'
        '</workbook>'
    )


def _workbook_rels_xml() -> str:
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        '<Relationship Id="rId1" '
        'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" '
        'Target="worksheets/sheet1.xml"/>'
        '<Relationship Id="rId2" '
        'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" '
        'Target="styles.xml"/>'
        '</Relationships>'
    )


def _styles_xml() -> str:
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
        '<fonts count="1"><font><sz val="11"/><name val="Calibri"/></font></fonts>'
        '<fills count="1"><fill><patternFill patternType="none"/></fill></fills>'
        '<borders count="1"><border><left/><right/><top/><bottom/><diagonal/></border></borders>'
        '<cellStyleXfs count="1"><xf numFmtId="0" fontId="0" fillId="0" borderId="0"/></cellStyleXfs>'
        '<cellXfs count="1"><xf numFmtId="0" fontId="0" fillId="0" borderId="0" xfId="0"/></cellXfs>'
        '</styleSheet>'
    )


def _parse_args() -> argparse.Namespace:
    now = datetime.now()
    parser = argparse.ArgumentParser(description="Export Sber operations for one month to a single XLSX file.")
    parser.add_argument("--year", type=int, default=now.year, help="Year to export, for example 2026.")
    parser.add_argument("--month", type=int, default=now.month, help="Month to export, from 1 to 12.")
    parser.add_argument("--output", type=Path, default=None, help="Output XLSX path.")
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: read the CLI options, export the requested month with
    # a default-configured exporter, and report where the workbook was written.
    args = _parse_args()
    output = SberMonthHistoryExporter().save_month(args.year, args.month, args.output)
    print(f"Saved {args.year}-{args.month:02d} operations to {output}")