add support for converting a pdf to txt
This commit is contained in:
parent
aa5f319494
commit
ff087129b0
@ -12,4 +12,5 @@ selenium==4.9.1
|
|||||||
webdriver-manager==4.0.2
|
webdriver-manager==4.0.2
|
||||||
click
|
click
|
||||||
git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
|
git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
|
||||||
linkedin-api
|
linkedin-api
|
||||||
|
PyPDF2==3.0.1
|
@ -3,9 +3,9 @@ import yaml
|
|||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
import os
|
import os
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
import tiktoken
|
|
||||||
import re
|
import re
|
||||||
from jsonschema import validate, ValidationError
|
from jsonschema import validate, ValidationError
|
||||||
|
import PyPDF2
|
||||||
|
|
||||||
def load_yaml(file_path: str) -> Dict[str, Any]:
|
def load_yaml(file_path: str) -> Dict[str, Any]:
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
@ -27,10 +27,6 @@ def get_api_key() -> str:
|
|||||||
|
|
||||||
return api_key
|
return api_key
|
||||||
|
|
||||||
def num_tokens_from_string(string: str, model: str) -> int:
|
|
||||||
encoding = tiktoken.encoding_for_model(model)
|
|
||||||
return len(encoding.encode(string))
|
|
||||||
|
|
||||||
def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
|
def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
|
||||||
client = OpenAI(api_key=api_key)
|
client = OpenAI(api_key=api_key)
|
||||||
|
|
||||||
@ -96,7 +92,8 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
|
|||||||
if match:
|
if match:
|
||||||
return match.group(1).strip()
|
return match.group(1).strip()
|
||||||
else:
|
else:
|
||||||
raise ValueError("YAML content not found in the expected format")
|
raise ValueError("YAML content not found in the expected format")
|
||||||
|
|
||||||
def save_yaml(data: str, output_file: str):
|
def save_yaml(data: str, output_file: str):
|
||||||
with open(output_file, 'w') as file:
|
with open(output_file, 'w') as file:
|
||||||
file.write(data)
|
file.write(data)
|
||||||
@ -120,16 +117,30 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
|
|||||||
|
|
||||||
print(report)
|
print(report)
|
||||||
|
|
||||||
|
def pdf_to_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are joined with a newline so the last word of one page is not
        # fused to the first word of the next. `or ""` keeps a page with no
        # extractable text from breaking the join. The join runs inside the
        # `with` block because the reader pulls page data from the open file.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
|
parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")
|
||||||
parser.add_argument("--input", required=True, help="Path to the input text resume file")
|
parser.add_argument("--input", required=True, help="Path to the input resume file (PDF or TXT)")
|
||||||
parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
|
parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
api_key = get_api_key()
|
api_key = get_api_key()
|
||||||
schema = load_yaml("assets/resume_schema.yaml")
|
schema = load_yaml("assets/resume_schema.yaml")
|
||||||
resume_text = load_resume_text(args.input)
|
|
||||||
|
# Check if input is PDF or TXT
|
||||||
|
if args.input.lower().endswith('.pdf'):
|
||||||
|
resume_text = pdf_to_text(args.input)
|
||||||
|
print(f"PDF resume converted to text successfully.")
|
||||||
|
else:
|
||||||
|
resume_text = load_resume_text(args.input)
|
||||||
|
|
||||||
generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
|
generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
|
||||||
save_yaml(generated_yaml, args.output)
|
save_yaml(generated_yaml, args.output)
|
||||||
|
Loading…
Reference in New Issue
Block a user