add support for converting a pdf to txt

This commit is contained in:
Maurice McCabe 2024-09-02 01:06:11 -07:00
parent aa5f319494
commit ff087129b0
2 changed files with 22 additions and 10 deletions

View File

@ -12,4 +12,5 @@ selenium==4.9.1
webdriver-manager==4.0.2
click
git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
linkedin-api
linkedin-api
PyPDF2==3.0.1

View File

@ -3,9 +3,9 @@ import yaml
from openai import OpenAI
import os
from typing import Dict, Any
import tiktoken
import re
from jsonschema import validate, ValidationError
import PyPDF2
def load_yaml(file_path: str) -> Dict[str, Any]:
with open(file_path, 'r') as file:
@ -27,10 +27,6 @@ def get_api_key() -> str:
return api_key
def num_tokens_from_string(string: str, model: str) -> int:
    """Return how many tokens *string* encodes to for the given model.

    The encoding is looked up with ``tiktoken.encoding_for_model(model)``
    and the result is the length of the encoded token sequence.
    """
    tokens = tiktoken.encoding_for_model(model).encode(string)
    return len(tokens)
def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
client = OpenAI(api_key=api_key)
@ -96,7 +92,8 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
if match:
return match.group(1).strip()
else:
raise ValueError("YAML content not found in the expected format")
raise ValueError("YAML content not found in the expected format")
def save_yaml(data: str, output_file: str):
    """Write already-serialized YAML text to *output_file*.

    Args:
        data: The YAML document as a string; it is written verbatim.
        output_file: Destination path. An existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Pin UTF-8 instead of relying on the platform's locale encoding, so
    # non-ASCII characters in a resume (accented names, bullets, arrows)
    # are written the same way on every OS and never raise
    # UnicodeEncodeError.
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(data)
@ -120,16 +117,30 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
print(report)
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages in document order, with one newline between
        consecutive pages. Pages that yield no text contribute an empty
        string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction happens inside the ``with`` block because the reader
        # is built on the open file handle.
        # ``or ""`` guards against a falsy/None result so the join below
        # cannot fail on a page without extractable text.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Separate pages with a newline so the last word of one page is not
    # fused with the first word of the next.
    return "\n".join(page_texts)
def main():
parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
parser.add_argument("--input", required=True, help="Path to the input text resume file")
parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")
parser.add_argument("--input", required=True, help="Path to the input resume file (PDF or TXT)")
parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
args = parser.parse_args()
try:
api_key = get_api_key()
schema = load_yaml("assets/resume_schema.yaml")
resume_text = load_resume_text(args.input)
# Check if input is PDF or TXT
if args.input.lower().endswith('.pdf'):
resume_text = pdf_to_text(args.input)
print(f"PDF resume converted to text successfully.")
else:
resume_text = load_resume_text(args.input)
generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
save_yaml(generated_yaml, args.output)