add support for converting a pdf to txt

2024-09-02 01:06:11 -07:00 · 2024-09-02 01:06:11 -07:00 · ff087129b0
commit ff087129b0
parent aa5f319494
2 changed files with 22 additions and 10 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ selenium==4.9.1
 webdriver-manager==4.0.2
 click
 git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
-linkedin-api
+linkedin-api
+PyPDF2==3.0.1
--- a/resume_yaml_generator.py
+++ b/resume_yaml_generator.py
@@ -3,9 +3,9 @@ import yaml
 from openai import OpenAI
 import os
 from typing import Dict, Any
-import tiktoken
 import re
 from jsonschema import validate, ValidationError
+import PyPDF2

 def load_yaml(file_path: str) -> Dict[str, Any]:
    with open(file_path, 'r') as file:
@@ -27,10 +27,6 @@ def get_api_key() -> str:
    
    return api_key

-def num_tokens_from_string(string: str, model: str) -> int:
-    encoding = tiktoken.encoding_for_model(model)
-    return len(encoding.encode(string))
-
 def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
    client = OpenAI(api_key=api_key)

@@ -96,7 +92,8 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
    if match:
        return match.group(1).strip()
    else:
-        raise ValueError("YAML content not found in the expected format")                                                                                                                                                                                                                                                                                                                                                                                        
+        raise ValueError("YAML content not found in the expected format")
+
 def save_yaml(data: str, output_file: str):
    with open(output_file, 'w') as file:
        file.write(data)
@@ -120,16 +117,30 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
    
    print(report)

+def pdf_to_text(pdf_path: str) -> str:
+    """Return the text of every page of the PDF at ``pdf_path``, pages separated by newlines."""
+    with open(pdf_path, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        # Pages are read while the file is still open. A page with no extractable
+        # text contributes an empty string; the newline keeps adjacent pages' words apart.
+        return "\n".join(page.extract_text() or "" for page in reader.pages)
+
 def main():
-    parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
-    parser.add_argument("--input", required=True, help="Path to the input text resume file")
+    parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")
+    parser.add_argument("--input", required=True, help="Path to the input resume file (PDF or TXT)")
    parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
    args = parser.parse_args()

    try:
        api_key = get_api_key()
        schema = load_yaml("assets/resume_schema.yaml")
-        resume_text = load_resume_text(args.input)
+
+        # Check if input is PDF or TXT
+        if args.input.lower().endswith('.pdf'):
+            resume_text = pdf_to_text(args.input)
+            print(f"PDF resume converted to text successfully.")
+        else:
+            resume_text = load_resume_text(args.input)

        generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
        save_yaml(generated_yaml, args.output)