add support for converting a pdf to txt

This commit is contained in:
Maurice McCabe 2024-09-02 01:06:11 -07:00
parent aa5f319494
commit ff087129b0
2 changed files with 22 additions and 10 deletions

View File

@ -12,4 +12,5 @@ selenium==4.9.1
webdriver-manager==4.0.2 webdriver-manager==4.0.2
click click
git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
linkedin-api linkedin-api
PyPDF2==3.0.1

View File

@ -3,9 +3,9 @@ import yaml
from openai import OpenAI from openai import OpenAI
import os import os
from typing import Dict, Any from typing import Dict, Any
import tiktoken
import re import re
from jsonschema import validate, ValidationError from jsonschema import validate, ValidationError
import PyPDF2
def load_yaml(file_path: str) -> Dict[str, Any]: def load_yaml(file_path: str) -> Dict[str, Any]:
with open(file_path, 'r') as file: with open(file_path, 'r') as file:
@ -27,10 +27,6 @@ def get_api_key() -> str:
return api_key return api_key
def num_tokens_from_string(string: str, model: str) -> int:
    """Count the tokens that `string` encodes to for the given model.

    Args:
        string: Text to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        The length of the encoded token sequence.
    """
    tokens = tiktoken.encoding_for_model(model).encode(string)
    return len(tokens)
def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str: def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
client = OpenAI(api_key=api_key) client = OpenAI(api_key=api_key)
@ -96,7 +92,8 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
if match: if match:
return match.group(1).strip() return match.group(1).strip()
else: else:
raise ValueError("YAML content not found in the expected format") raise ValueError("YAML content not found in the expected format")
def save_yaml(data: str, output_file: str): def save_yaml(data: str, output_file: str):
with open(output_file, 'w') as file: with open(output_file, 'w') as file:
file.write(data) file.write(data)
@ -120,16 +117,30 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
print(report) print(report)
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text extracted from each page, concatenated in page order with
        no separator inserted between pages.

    Raises:
        OSError: If the file at `pdf_path` cannot be opened.
    """
    # Opened in binary mode because the reader is given the raw file object.
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with a single join rather than growing a string
        # page by page. The join runs inside the `with` block so the file is
        # still open while pages are read. A page that yields no text
        # contributes an empty string instead of breaking the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
def main(): def main():
parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API") parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")
parser.add_argument("--input", required=True, help="Path to the input text resume file") parser.add_argument("--input", required=True, help="Path to the input resume file (PDF or TXT)")
parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file") parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
args = parser.parse_args() args = parser.parse_args()
try: try:
api_key = get_api_key() api_key = get_api_key()
schema = load_yaml("assets/resume_schema.yaml") schema = load_yaml("assets/resume_schema.yaml")
resume_text = load_resume_text(args.input)
# Check if input is PDF or TXT
if args.input.lower().endswith('.pdf'):
resume_text = pdf_to_text(args.input)
print(f"PDF resume converted to text successfully.")
else:
resume_text = load_resume_text(args.input)
generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key) generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
save_yaml(generated_yaml, args.output) save_yaml(generated_yaml, args.output)