"""Parse a Capital One statement PDF into a list of transactions.

The statement text is pulled out of the PDF with PyPDF2 and then scanned with
a handful of regular expressions: one for the new balance, one for the
statement ("as of") date and one for the individual transaction lines.
"""

import math
import re
import sys
from datetime import datetime

import PyPDF2 as pdf

import model


class Regex:
    """Base class for extractors that pull a value out of statement text.

    Subclasses define a class-level ``pattern`` and override ``extract``.
    """

    def error(self, msg):
        """Print *msg* tagged with the extractor's class name and exit.

        The process is terminated with ``sys.exit(-1)``; this never returns.
        """
        print(f"ERROR: {self.__class__.__name__} failed: {msg}")
        sys.exit(-1)

    def extract(self, text):
        """Return the extracted value; the base implementation returns None."""
        return None

    def _single_match(self, text):
        """Return the sole ``re.findall`` result of ``self.pattern`` in *text*.

        Calls ``error`` (which exits) when there are zero or several matches.
        Requires the subclass to define ``pattern``.
        """
        results = re.findall(self.pattern, text)
        if not results:
            self.error("No matches.")
        if len(results) > 1:
            self.error(f"Too many matches {len(results)}")
        return results[0]


class Balance(Regex):
    """Extract the "New Balance" amount as a float."""

    pattern = r'\nNew\sBalance.*\$([0-9,\.]*)'

    def extract(self, text):
        """Return the new balance found in *text*, thousands commas removed."""
        return float(self._single_match(text).replace(',', ''))


class Year(Regex):
    """Extract the statement date from the "Available Credit (as of ...)" line."""

    pattern = r'\nAvailable\sCredit\s+\(as\sof\s(.*)\)'

    def extract_date(self, text):
        """Return the full "as of" date in *text* as a ``datetime``.

        The date is expected in ``'%b %d, %Y'`` form (e.g. ``Jan 05, 2024``);
        ``%b`` is matched using the current locale's month abbreviations.
        """
        return datetime.strptime(self._single_match(text), '%b %d, %Y')

    def extract(self, text):
        """Return only the year (int) of the "as of" date in *text*."""
        return self.extract_date(text).year


class Transactions(Regex):
    """Extract the transaction rows (date, description, amount)."""

    # Groups: 1) first date on the row, 2) description, 3) optional minus
    # sign, 4) amount. The second date on the row is matched but not captured.
    pattern = r'\n([a-zA-Z]{3}\s[0-9]{1,2})\s[a-zA-Z]{3}\s[0-9]{1,2}(.*[a-zA-Z])\s*(\-?)\s\$([0-9,\.]*)'

    def extract(self, text):
        """Return ``[date_str, description, amount]`` lists for each charge.

        Rows carrying a minus sign are treated as payments: they are reported
        on stdout and left out of the result. ``date_str`` is still the raw
        ``'Mon D'`` text; the year is attached later by ``Parser.parse``.
        """
        transactions = []
        for result in re.findall(self.pattern, text):
            if len(result) != 4:
                # error() already prefixes the message with "ERROR: ...".
                self.error("Invalid result.")

            date = result[0]
            # Collapse runs of whitespace inside the description.
            description = " ".join(result[1].split())
            is_payment = '-' in result[2]
            amount = float(result[3].replace(',', ''))

            if is_payment:
                print(f"Skipping payment: {amount}")
                continue

            transactions.append([date, description, amount])

        return transactions


class Parser(model.BaseParser):
    """Statement parser registered under ``TransactionSource.CAPITAL_ONE``."""

    def __init__(self):
        self.balance = Balance()
        self.year = Year()
        self.transactions = Transactions()

    @property
    def source(self):
        """Return the transaction source this parser handles."""
        return model.TransactionSource.CAPITAL_ONE

    @staticmethod
    def _resolve_date(raw_date, statement_date):
        """Turn a year-less ``'Mon D'`` string into a full ``datetime``.

        The year is parsed together with the month and day: parsing
        ``'%b %d'`` alone defaults to the year 1900, which is not a leap year,
        so "Feb 29" would raise ``ValueError``.

        A date that would fall after the statement date is moved back one
        year. This assumes the statement date is on or after every listed
        transaction date, so a later date means the statement straddles New
        Year (e.g. a December purchase on a January statement).
        """
        resolved = datetime.strptime(
            f"{raw_date} {statement_date.year}", "%b %d %Y"
        )
        if resolved.date() > statement_date.date():
            resolved = resolved.replace(year=statement_date.year - 1)
        return resolved

    def parse(self, file_name):
        """Parse the statement PDF at *file_name*.

        Returns a list of ``[datetime, description, amount]`` lists, one per
        non-payment transaction. Prints an error and exits with status -1 if
        any extractor fails or if the transaction amounts do not add up to the
        extracted balance.
        """
        with open(file_name, 'rb') as pdf_file:
            reader = pdf.PdfReader(pdf_file)
            # Pages must be read while the file is still open.
            text = ''.join(page.extract_text() for page in reader.pages)

        balance = self.balance.extract(text)
        statement_date = self.year.extract_date(text)
        transactions = self.transactions.extract(text)

        # Validate that the transactions add up to the extracted balance.
        total = sum(amount for _, _, amount in transactions)
        if not math.isclose(total, balance, abs_tol=0.001):
            print(f"ERROR: Actual {total} != Expected {balance}")
            sys.exit(-1)

        # Replace each raw 'Mon D' string with a full datetime, in place.
        for transaction in transactions:
            transaction[0] = self._resolve_date(transaction[0], statement_date)

        return transactions