104 lines
3.1 KiB
Python
Executable File
104 lines
3.1 KiB
Python
Executable File
import PyPDF2 as pdf
|
|
import re
|
|
import sys
|
|
import math
|
|
from datetime import datetime
|
|
|
|
import model
|
|
|
|
class Regex:
    """Base class for the regex-driven extractors defined in this module.

    Subclasses override :meth:`extract` and report fatal problems through
    :meth:`error`.
    """

    def error(self, msg):
        """Print a failure message naming the concrete class, then exit.

        Args:
            msg: Text describing what went wrong.

        Raises:
            SystemExit: Always, with exit code -1.
        """
        extractor_name = self.__class__.__name__
        report = f"ERROR: {extractor_name} failed: {msg}"
        print(report)
        sys.exit(-1)

    def extract(self, text):
        """Return the value pulled out of ``text``; the base class yields None."""
        return None
|
|
|
|
class Balance(Regex):
    """Extracts the "New Balance" dollar amount from statement text."""

    # Matches a line that starts with "New Balance" and captures the digits,
    # commas and periods following the last "$" on that line.
    pattern = r'\nNew\sBalance.*\$([0-9,\.]*)'

    def extract(self, text):
        """Return the single "New Balance" amount found in ``text`` as a float.

        Reports through ``error`` (which exits) unless exactly one match is
        found. Thousands separators are stripped before conversion.
        """
        matches = re.findall(self.pattern, text)
        match_count = len(matches)

        if match_count == 0:
            self.error("No matches.")
        if match_count > 1:
            self.error(f"Too many matches {match_count}")

        digits = matches[0].replace(',', '')
        return float(digits)
|
|
|
|
class Year(Regex):
    """Extracts the year from the "Available Credit (as of ...)" line."""

    # Captures the text between "(as of " and the closing parenthesis on a
    # line that starts with "Available Credit".
    pattern = r'\nAvailable\sCredit\s+\(as\sof\s(.*)\)'

    def extract(self, text):
        """Return the year of the single "as of" date found in ``text``.

        Reports through ``error`` (which exits) unless exactly one match is
        found. The captured text is parsed with the format ``'%b %d, %Y'``.
        """
        matches = re.findall(self.pattern, text)
        match_count = len(matches)

        if match_count == 0:
            self.error("No matches.")
        if match_count > 1:
            self.error(f"Too many matches {match_count}")

        as_of = datetime.strptime(matches[0], '%b %d, %Y')
        return as_of.year
|
|
|
|
class Transactions(Regex):
    """Extracts the individual transaction rows from statement text."""

    # Groups: (1) first "Mon D" date on the line, (2) description ending in a
    # letter, (3) optional "-" sign, (4) amount digits after "$". The second
    # "Mon D" date on the line is matched but not captured.
    # NOTE(review): presumably the two dates are transaction and posting
    # dates -- confirm against a real statement.
    pattern = r'\n([a-zA-Z]{3}\s[0-9]{1,2})\s[a-zA-Z]{3}\s[0-9]{1,2}(.*[a-zA-Z])\s*(\-?)\s\$([0-9,\.]*)'

    def extract(self, text):
        """Return every transaction found in ``text``.

        Args:
            text: Statement text to scan.

        Returns:
            A list of ``[date, description, amount]`` lists. ``date`` is the
            captured "Mon D" string, ``description`` has its whitespace runs
            collapsed to single spaces, and ``amount`` is a float that is
            negated for rows carrying a "-" sign. Signed rows whose
            description contains "AUTOPAY" are skipped (and logged).
        """
        transactions = []
        for result in re.findall(self.pattern, text):
            if len(result) != 4:
                # error() already prefixes "ERROR: <class> failed:", so the
                # message itself must not repeat the "ERROR:" tag.
                self.error("Invalid result.")

            date, raw_description, sign, raw_amount = result
            description = " ".join(raw_description.split())
            is_credit = '-' in sign
            amount = float(raw_amount.replace(',', ''))

            if is_credit:
                if "AUTOPAY" in description:
                    print(f"Skipping payment: {amount}")
                    continue

                amount *= -1

            transactions.append([date, description, amount])

        return transactions
|
|
|
|
class Parser(model.BaseParser):
    """Parser for PDF statements tagged as ``TransactionSource.CAPITAL_ONE``."""

    def __init__(self):
        """Create the three extractors used by :meth:`parse`."""
        self.balance = Balance()
        self.year = Year()
        self.transactions = Transactions()

    @property
    def source(self):
        """The transaction source this parser handles."""
        return model.TransactionSource.CAPITAL_ONE

    def parse(self, file_name):
        """Read a PDF statement and return its transactions.

        Args:
            file_name: Path of the PDF file to read.

        Returns:
            A list of ``[date, description, amount]`` lists where ``date`` is
            a ``datetime`` carrying the statement year.

        Raises:
            SystemExit: If the transaction amounts do not add up to the
                extracted balance (exit code -1), or if one of the
                extractors reports an error.
        """
        with open(file_name, 'rb') as pdf_file:
            reader = pdf.PdfReader(pdf_file)
            # Extraction must happen while the file is still open.
            text = ''.join(page.extract_text() for page in reader.pages)

        balance = self.balance.extract(text)
        year = self.year.extract(text)
        transactions = self.transactions.extract(text)

        # Validate that the transactions add up to the extracted balance.
        total = sum(x[2] for x in transactions)
        if not math.isclose(total, balance, abs_tol=0.001):
            print(f"ERROR: Actual {total} != Expected {balance}")
            sys.exit(-1)

        # Add the year to all the parsed dates. The year is parsed together
        # with the month and day: parsing "%b %d" alone uses the default year
        # 1900, which rejects "Feb 29" even when the statement year is a leap
        # year.
        # NOTE(review): every row gets the statement's "as of" year; a
        # statement spanning December/January would presumably need the
        # previous year for the December rows -- confirm.
        for transaction in transactions:
            transaction[0] = datetime.strptime(
                f"{transaction[0]} {year}", "%b %d %Y"
            )

        return transactions