From be8ddb8241bc0ca960fa7fec7f5376f4a7818c9b Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Sat, 31 Aug 2024 14:56:37 -0700
Subject: [PATCH 1/7] resume generator cmdline utility

---
 .gitignore                    |   2 +
 README.md                     |  21 +++++
 assets/resume_liam_murphy.txt |  55 ++++++++++++
 assets/resume_schema.yaml     | 132 +++++++++++++++++++++++++++
 requirements.txt              | Bin 670 -> 710 bytes
 resume_yaml_generator.py      | 164 ++++++++++++++++++++++++++++++++++
 6 files changed, 374 insertions(+)
 create mode 100644 assets/resume_liam_murphy.txt
 create mode 100644 assets/resume_schema.yaml
 create mode 100644 resume_yaml_generator.py

diff --git a/.gitignore b/.gitignore
index 4e73720..1e8e081 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ generated_cv*
 .vscode
 chrome_profile
 answers.json
+resume.yaml
+resume.yaml_validation_report.txt
diff --git a/README.md b/README.md
index 3258335..78c669e 100644
--- a/README.md
+++ b/README.md
@@ -452,6 +452,27 @@ Each section has specific fields to fill out:
     willing_to_undergo_drug_tests: "No"
     willing_to_undergo_background_checks: "Yes"
   ```
+### 4. Generating plain_text_resume.yaml from a Text Resume
+
+To simplify the process of creating your `plain_text_resume.yaml` file, you can use the provided script to generate it from a text-based resume. Follow these steps:
+
+1. Prepare your resume in a plain text format (.txt file).
+
+2. Place your text resume in the `data_folder` directory.
+
+3. Run the following command:
+
+   ```bash
+   python generate_resume_yaml.py --input data_folder/your_resume.txt --output data_folder/plain_text_resume.yaml
+   ```
+
+   Replace `your_resume.txt` with the actual name of your text resume file.
+
+4. The script will generate a `plain_text_resume.yaml` file in the `data_folder` directory.
+
+5. Review the generated YAML file and make any necessary adjustments to ensure all information is correct and complete.
+
+This automated process helps in creating a structured YAML file from your existing resume, saving time and reducing the chance of errors in manual data entry.
 
 ### PLUS. data_folder_example
 
diff --git a/assets/resume_liam_murphy.txt b/assets/resume_liam_murphy.txt
new file mode 100644
index 0000000..30b5402
--- /dev/null
+++ b/assets/resume_liam_murphy.txt
@@ -0,0 +1,55 @@
+Liam Murphy
+Galway, Ireland
+Email: liam.murphy@gmail.com | LinkedIn: liam-murphy
+GitHub: liam-murphy | Phone: +353 871234567
+
+Education
+Bachelor's Degree in Computer Science
+National University of Ireland, Galway (GPA: 4/4)
+Graduation Year: 2020
+
+Experience
+Co-Founder & Software Engineer
+CryptoWave Solutions (03/2021 - Present)
+Location: Ireland | Industry: Blockchain Technology
+
+Co-founded and led a startup specializing in app and software development with a focus on blockchain technology
+Provided blockchain consultations for 10+ companies, enhancing their software capabilities with secure, decentralized solutions
+Developed blockchain applications, integrated cutting-edge technology to meet client needs and drive industry innovation
+Research Intern
+National University of Ireland, Galway (11/2022 - 03/2023)
+Location: Galway, Ireland | Industry: IoT Security Research
+
+Conducted in-depth research on IoT security, focusing on binary instrumentation and runtime monitoring
+Performed in-depth study of the MQTT protocol and Falco
+Developed multiple software components including MQTT packet analysis library, Falco adapter, and RML monitor in Prolog
+Authored thesis "Binary Instrumentation for Runtime Monitoring of Internet of Things Systems Using Falco"
+Software Engineer
+University Hospital Galway (05/2022 - 11/2022)
+Location: Galway, Ireland | Industry: Healthcare IT
+
+Integrated and enforced robust security protocols
+Developed and maintained a critical software tool for password validation used by over 1,600 employees
+Played an integral role in the hospital's cybersecurity team
+Projects
+JobBot
+AI-driven tool to automate and personalize job applications on LinkedIn, gained over 3000 stars on GitHub, improving efficiency and reducing application time
+Link: JobBot
+
+mqtt-packet-parser
+Developed a Node.js module for parsing MQTT packets, improved parsing efficiency by 40%
+Link: mqtt-packet-parser
+
+Achievements
+Winner of an Irish public competition - Won first place in a public competition with a perfect score of 70/70, securing a Software Developer position at University Hospital Galway
+Galway Merit Scholarship - Awarded annually from 2018 to 2020 in recognition of academic excellence and contribution
+GitHub Recognition - Gained over 3000 stars on GitHub with JobBot project
+Certifications
+C1
+
+Languages
+English - Native
+Spanish - Professional
+Interests
+Full-Stack Development, Software Architecture, IoT system design and development, Artificial Intelligence, Cloud Technologies
+
diff --git a/assets/resume_schema.yaml b/assets/resume_schema.yaml
new file mode 100644
index 0000000..9a86f2f
--- /dev/null
+++ b/assets/resume_schema.yaml
@@ -0,0 +1,132 @@
+# YAML Schema for plain_text_resume.yaml
+
+personal_information:
+  type: object
+  properties:
+    name: {type: string}
+    surname: {type: string}
+    date_of_birth: {type: string, format: date}
+    country: {type: string}
+    city: {type: string}
+    address: {type: string}
+    phone_prefix: {type: string, format: phone_prefix}
+    phone: {type: string, format: phone}
+    email: {type: string, format: email}
+    github: {type: string, format: uri}
+    linkedin: {type: string, format: uri}
+  required: [name, surname, date_of_birth, country, city, address, phone_prefix, phone, email]
+
+education_details:
+  type: array
+  items:
+    type: object
+    properties:
+      degree: {type: string}
+      university: {type: string}
+      gpa: {type: string}
+      graduation_year: {type: string}
+      field_of_study: {type: string}
+      exam:
+        type: object
+        additionalProperties: {type: string}
+    required: [degree, university, gpa, graduation_year, field_of_study]
+
+experience_details:
+  type: array
+  items:
+    type: object
+    properties:
+      position: {type: string}
+      company: {type: string}
+      employment_period: {type: string}
+      location: {type: string}
+      industry: {type: string}
+      key_responsibilities:
+        type: object
+        additionalProperties: {type: string}
+      skills_acquired:
+        type: array
+        items: {type: string}
+    required: [position, company, employment_period, location, industry, key_responsibilities, skills_acquired]
+
+projects:
+  type: array
+  items:
+    type: object
+    properties:
+      name: {type: string}
+      description: {type: string}
+      link: {type: string, format: uri}
+    required: [name, description]
+
+achievements:
+  type: array
+  items:
+    type: object
+    properties:
+      name: {type: string}
+      description: {type: string}
+    required: [name, description]
+
+certifications:
+  type: array
+  items: {type: string}
+
+languages:
+  type: array
+  items:
+    type: object
+    properties:
+      language: {type: string}
+      proficiency: {type: string, enum: [Native, Fluent, Intermediate, Beginner]}
+    required: [language, proficiency]
+
+interests:
+  type: array
+  items: {type: string}
+
+availability:
+  type: object
+  properties:
+    notice_period: {type: string}
+  required: [notice_period]
+
+salary_expectations:
+  type: object
+  properties:
+    salary_range_usd: {type: string}
+  required: [salary_range_usd]
+
+self_identification:
+  type: object
+  properties:
+    gender: {type: string}
+    pronouns: {type: string}
+    veteran: {type: string, enum: ["Yes", "No"]}
+    disability: {type: string, enum: ["Yes", "No"]}
+    ethnicity: {type: string}
+  required: [gender, pronouns, veteran, disability, ethnicity]
+
+legal_authorization:
+  type: object
+  properties:
+    eu_work_authorization: {type: string, enum: ["Yes", "No"]}
+    us_work_authorization: {type: string, enum: ["Yes", "No"]}
+    requires_us_visa: {type: string, enum: ["Yes", "No"]}
+    requires_us_sponsorship: {type: string, enum: ["Yes", "No"]}
+    requires_eu_visa: {type: string, enum: ["Yes", "No"]}
+    legally_allowed_to_work_in_eu: {type: string, enum: ["Yes", "No"]}
+    legally_allowed_to_work_in_us: {type: string, enum: ["Yes", "No"]}
+    requires_eu_sponsorship: {type: string, enum: ["Yes", "No"]}
+  required: [eu_work_authorization, us_work_authorization, requires_us_visa, requires_us_sponsorship, requires_eu_visa, legally_allowed_to_work_in_eu, legally_allowed_to_work_in_us, requires_eu_sponsorship]
+
+work_preferences:
+  type: object
+  properties:
+    remote_work: {type: string, enum: ["Yes", "No"]}
+    in_person_work: {type: string, enum: ["Yes", "No"]}
+    open_to_relocation: {type: string, enum: ["Yes", "No"]}
+    willing_to_complete_assessments: {type: string, enum: ["Yes", "No"]}
+    willing_to_undergo_drug_tests: {type: string, enum: ["Yes", "No"]}
+    willing_to_undergo_background_checks: {type: string, enum: ["Yes", "No"]}
+  required: [remote_work, in_person_work, open_to_relocation, willing_to_complete_assessments, willing_to_undergo_drug_tests, willing_to_undergo_background_checks]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f74a689c00362ab6724bec113a729791b031cce0..c473e7251426a7a8eb528daafe8dadc3b19202cf 100644
GIT binary patch
delta 48
zcmbQodW?0$JSGiZ1}=syhGK?%hCCpd%#gv5%8<*D$Y2YECJcHEMhwP4(trT~2HgnA

delta 7
OcmX@cI*)b3JSG4P2LitU

diff --git a/resume_yaml_generator.py b/resume_yaml_generator.py
new file mode 100644
index 0000000..00e6922
--- /dev/null
+++ b/resume_yaml_generator.py
@@ -0,0 +1,164 @@
+import argparse
+import yaml
+from openai import OpenAI
+import os
+from typing import Dict, Any
+import tiktoken
+import re
+from jsonschema import validate, ValidationError
+
+def load_yaml(file_path: str) -> Dict[str, Any]:
+    with open(file_path, 'r') as file:
+        return yaml.safe_load(file)
+
+def load_resume_text(file_path: str) -> str:
+    with open(file_path, 'r') as file:
+        return file.read()
+
+def get_api_key() -> str:
+    secrets_path = os.path.join('data_folder', 'secrets.yaml')
+    if not os.path.exists(secrets_path):
+        raise FileNotFoundError(f"Secrets file not found at {secrets_path}")
+    
+    secrets = load_yaml(secrets_path)
+    api_key = secrets.get('openai_api_key')
+    if not api_key:
+        raise ValueError("OpenAI API key not found in secrets.yaml")
+    
+    return api_key
+
+def num_tokens_from_string(string: str, model: str) -> int:
+    encoding = tiktoken.encoding_for_model(model)
+    return len(encoding.encode(string))
+
+def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
+    client = OpenAI(api_key=api_key)
+
+    prompt = f"""
+    I'm sending you the content of a text-based resume. Your task is to interpret this content and generate a YAML file that conforms to the following schema structure.
+    The generated YAML should include all required fields and follow the structure defined in the schema.
+
+    Pay special attention to the property attributes in the schema. These indicate the expected type and format for each field:
+    - 'type': Specifies the data type (e.g., string, object, array)
+    - 'format': Indicates a specific format for certain fields:
+    - 'date' format should be a valid date (e.g., YYYY-MM-DD)
+    - 'phone_prefix' format should be a valid country code with a '+' prefix (e.g., +1 for US)
+    - 'phone' format should be a valid phone number
+    - 'email' format should be a valid email address
+    - 'uri' format should be a valid URL
+    - 'enum': Provides a list of allowed values for a field
+
+    Important instructions:
+    1. Ensure that the YAML structure matches exactly with the provided schema. Use a dictionary structure that mirrors the schema.
+    2. For all sections, if information is not explicitly provided in the resume, make a best guess based on the context of the resume. This is CRUCIAL for the following fields:
+    - languages: Infer from the resume content or make an educated guess. Use the 'enum' values for proficiency.
+    - interests: Deduce from the overall resume or related experiences.
+    - availability (notice_period): Provide a reasonable estimate (e.g., "2 weeks" or "1 month").
+    - salary_expectations (salary_range_usd): Estimate based on experience level and industry standards.
+    - self_identification: Make reasonable assumptions based on the resume context. Use 'enum' values where provided.
+    - legal_authorization: Provide plausible values based on the resume information. Use 'Yes' or 'No' as per the 'enum' values.
+    - work_preferences: Infer from job history, skills, and overall resume tone. Use 'Yes' or 'No' as per the 'enum' values.
+    3. For the fields mentioned in point 2, always provide a value. Do not leave them blank or omit them.
+    4. For the 'key_responsibilities' field in 'experience_details', format the responsibilities as follows:
+    responsibility_1: "Description of first responsibility"
+    responsibility_2: "Description of second responsibility"
+    responsibility_3: "Description of third responsibility"
+    responsibility_4: "Description of fourth responsibility"
+    Continue this pattern for all responsibilities listed.
+    5. In the 'experience_details' section, ensure that 'position' comes before 'company' in each entry.
+    6. For the 'skills_acquired' field in 'experience_details', infer relevant skills based on the job responsibilities and industry. Do not leave this field empty.
+    7. Make reasonable inferences for any missing dates, such as date_of_birth or employment dates, ensuring they follow the 'date' format.
+    8. For array types (e.g., education_details, experience_details), ensure to include all required fields for each item as specified in the schema.
+
+    Resume Text Content:
+    {resume_text}
+
+    YAML Schema:
+    {yaml.dump(schema, default_flow_style=False)}
+
+    Generate the YAML content that matches this schema based on the resume content provided, ensuring all format hints are followed and making educated guesses where necessary. Be sure to include best guesses for ALL fields, even if not explicitly mentioned in the resume.
+    Enclose your response in <resume_yaml> tags. Only include the YAML content within these tags, without any additional text or code block markers.
+    """
+    
+    model = "gpt-3.5-turbo-16k"  # This model has a 16k token limit
+    tokens = num_tokens_from_string(prompt, model)
+    max_tokens = min(16385 - tokens, 4000)  # Ensure we don't exceed model's limit
+
+    if tokens > 16385:
+        print(f"Warning: The input exceeds the model's context length. Tokens: {tokens}")
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant that generates structured YAML content from resume files, paying close attention to format requirements and schema structure."},
+            {"role": "user", "content": prompt}
+        ],
+        max_tokens=max_tokens,
+        n=1,
+        stop=None,
+        temperature=0.5,
+    )
+
+    yaml_content = response.choices[0].message.content.strip()
+    
+    # Extract YAML content from between the tags
+    match = re.search(r'<resume_yaml>(.*?)</resume_yaml>', yaml_content, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    else:
+        raise ValueError("YAML content not found in the expected format")
+                                                                                                                                                                                                                                                                                                                                                                                        
+def save_yaml(data: str, output_file: str):
+    with open(output_file, 'w') as file:
+        file.write(data)
+
+def validate_yaml(yaml_content: str, schema: Dict[str, Any]) -> Dict[str, Any]:
+    try:
+        yaml_dict = yaml.safe_load(yaml_content)
+        validate(instance=yaml_dict, schema=schema)
+        return {"valid": True, "errors": None}
+    except ValidationError as e:
+        return {"valid": False, "errors": str(e)}
+
+def generate_report(validation_result: Dict[str, Any], output_file: str):
+    report = f"Validation Report for {output_file}\n"
+    report += "=" * 40 + "\n"
+    if validation_result["valid"]:
+        report += "YAML is valid and conforms to the schema.\n"
+    else:
+        report += "YAML is not valid. Errors:\n"
+        report += validation_result["errors"] + "\n"
+    
+    print(report)
+    with open(f"{output_file}_validation_report.txt", 'w') as file:
+        file.write(report)
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
+    parser.add_argument("resume_file", help="Path to the input text resume file")
+    parser.add_argument("schema_file", help="Path to the YAML schema file")
+    parser.add_argument("output_file", help="Path to the output YAML file")
+    args = parser.parse_args()
+
+    try:
+        api_key = get_api_key()
+        schema = load_yaml(args.schema_file)
+        resume_text = load_resume_text(args.resume_file)
+
+        generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
+        save_yaml(generated_yaml, args.output_file)
+
+        print(f"Resume YAML generated and saved to {args.output_file}")
+
+        validation_result = validate_yaml(generated_yaml, schema)
+        generate_report(validation_result, args.output_file)
+
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+    except ValueError as e:
+        print(f"Error: {e}")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 93691913f99d2b432f1a733b6ee8975a29ddaa34 Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Sat, 31 Aug 2024 15:41:08 -0700
Subject: [PATCH 2/7] update cmdline utility to match README

---
 {assets => data_folder_example}/resume_liam_murphy.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {assets => data_folder_example}/resume_liam_murphy.txt (100%)

diff --git a/assets/resume_liam_murphy.txt b/data_folder_example/resume_liam_murphy.txt
similarity index 100%
rename from assets/resume_liam_murphy.txt
rename to data_folder_example/resume_liam_murphy.txt

From 67543499ab881909d3f5496d6c6404235953fdcc Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Sat, 31 Aug 2024 15:41:33 -0700
Subject: [PATCH 3/7] update cmdline utility to match README

---
 .gitignore               |  2 --
 resume_yaml_generator.py | 27 ++++++++++++---------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1e8e081..4e73720 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,3 @@ generated_cv*
 .vscode
 chrome_profile
 answers.json
-resume.yaml
-resume.yaml_validation_report.txt
diff --git a/resume_yaml_generator.py b/resume_yaml_generator.py
index 00e6922..dcc6607 100644
--- a/resume_yaml_generator.py
+++ b/resume_yaml_generator.py
@@ -130,35 +130,32 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
         report += validation_result["errors"] + "\n"
     
     print(report)
-    with open(f"{output_file}_validation_report.txt", 'w') as file:
-        file.write(report)
 
 def main():
     parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
-    parser.add_argument("resume_file", help="Path to the input text resume file")
-    parser.add_argument("schema_file", help="Path to the YAML schema file")
-    parser.add_argument("output_file", help="Path to the output YAML file")
+    parser.add_argument("--input", required=True, help="Path to the input text resume file")
+    parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
     args = parser.parse_args()
 
     try:
         api_key = get_api_key()
-        schema = load_yaml(args.schema_file)
-        resume_text = load_resume_text(args.resume_file)
+        schema = load_yaml("assets/resume_schema.yaml")
+        resume_text = load_resume_text(args.input)
 
         generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
-        save_yaml(generated_yaml, args.output_file)
+        save_yaml(generated_yaml, args.output)
 
-        print(f"Resume YAML generated and saved to {args.output_file}")
+        print(f"Resume YAML generated and saved to {args.output}")
 
         validation_result = validate_yaml(generated_yaml, schema)
-        generate_report(validation_result, args.output_file)
+        if validation_result["valid"]:
+            print("YAML is valid and conforms to the schema.")
+        else:
+            print("YAML is not valid. Errors:")
+            print(validation_result["errors"])
 
-    except FileNotFoundError as e:
-        print(f"Error: {e}")
-    except ValueError as e:
-        print(f"Error: {e}")
     except Exception as e:
-        print(f"An unexpected error occurred: {e}")
+        print(f"An error occurred: {e}")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file

From 369c23791da7ffeb789c66f1e54a3fce9bcf28e2 Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Sat, 31 Aug 2024 15:58:55 -0700
Subject: [PATCH 4/7] use gpt-4o-mini

---
 resume_yaml_generator.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/resume_yaml_generator.py b/resume_yaml_generator.py
index dcc6607..0252caa 100644
--- a/resume_yaml_generator.py
+++ b/resume_yaml_generator.py
@@ -79,23 +79,13 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
     Generate the YAML content that matches this schema based on the resume content provided, ensuring all format hints are followed and making educated guesses where necessary. Be sure to include best guesses for ALL fields, even if not explicitly mentioned in the resume.
     Enclose your response in <resume_yaml> tags. Only include the YAML content within these tags, without any additional text or code block markers.
     """
-    
-    model = "gpt-3.5-turbo-16k"  # This model has a 16k token limit
-    tokens = num_tokens_from_string(prompt, model)
-    max_tokens = min(16385 - tokens, 4000)  # Ensure we don't exceed model's limit
-
-    if tokens > 16385:
-        print(f"Warning: The input exceeds the model's context length. Tokens: {tokens}")
 
     response = client.chat.completions.create(
-        model=model,
+        model="gpt-4o-mini",
         messages=[
             {"role": "system", "content": "You are a helpful assistant that generates structured YAML content from resume files, paying close attention to format requirements and schema structure."},
             {"role": "user", "content": prompt}
         ],
-        max_tokens=max_tokens,
-        n=1,
-        stop=None,
         temperature=0.5,
     )
 
@@ -106,8 +96,7 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
     if match:
         return match.group(1).strip()
     else:
-        raise ValueError("YAML content not found in the expected format")
-                                                                                                                                                                                                                                                                                                                                                                                        
+        raise ValueError("YAML content not found in the expected format")                                                                                                                                                                                                                                                                                                                                                                                        
 def save_yaml(data: str, output_file: str):
     with open(output_file, 'w') as file:
         file.write(data)

From ff087129b09bb21635a106e639d5ffd173f1dd4e Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Mon, 2 Sep 2024 01:06:11 -0700
Subject: [PATCH 5/7] add support for converting a pdf to txt

---
 requirements.txt         |  3 ++-
 resume_yaml_generator.py | 29 ++++++++++++++++++++---------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7f6e144..5139ade 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ selenium==4.9.1
 webdriver-manager==4.0.2
 click
 git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
-linkedin-api
\ No newline at end of file
+linkedin-api
+PyPDF2==3.0.1
\ No newline at end of file
diff --git a/resume_yaml_generator.py b/resume_yaml_generator.py
index 0252caa..acf36f5 100644
--- a/resume_yaml_generator.py
+++ b/resume_yaml_generator.py
@@ -3,9 +3,9 @@ import yaml
 from openai import OpenAI
 import os
 from typing import Dict, Any
-import tiktoken
 import re
 from jsonschema import validate, ValidationError
+import PyPDF2
 
 def load_yaml(file_path: str) -> Dict[str, Any]:
     with open(file_path, 'r') as file:
@@ -27,10 +27,6 @@ def get_api_key() -> str:
     
     return api_key
 
-def num_tokens_from_string(string: str, model: str) -> int:
-    encoding = tiktoken.encoding_for_model(model)
-    return len(encoding.encode(string))
-
 def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key: str) -> str:
     client = OpenAI(api_key=api_key)
 
@@ -96,7 +92,8 @@ def generate_yaml_from_resume(resume_text: str, schema: Dict[str, Any], api_key:
     if match:
         return match.group(1).strip()
     else:
-        raise ValueError("YAML content not found in the expected format")                                                                                                                                                                                                                                                                                                                                                                                        
+        raise ValueError("YAML content not found in the expected format")
+
 def save_yaml(data: str, output_file: str):
     with open(output_file, 'w') as file:
         file.write(data)
@@ -120,16 +117,30 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
     
     print(report)
 
+def pdf_to_text(pdf_path: str) -> str:
+    text = ""
+    with open(pdf_path, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        for page in reader.pages:
+            text += page.extract_text()
+    return text
+
 def main():
-    parser = argparse.ArgumentParser(description="Generate a resume YAML file from a text resume using OpenAI API")
-    parser.add_argument("--input", required=True, help="Path to the input text resume file")
+    parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")
+    parser.add_argument("--input", required=True, help="Path to the input resume file (PDF or TXT)")
     parser.add_argument("--output", default="data_folder/plain_text_resume.yaml", help="Path to the output YAML file")
     args = parser.parse_args()
 
     try:
         api_key = get_api_key()
         schema = load_yaml("assets/resume_schema.yaml")
-        resume_text = load_resume_text(args.input)
+
+        # Check if input is PDF or TXT
+        if args.input.lower().endswith('.pdf'):
+            resume_text = pdf_to_text(args.input)
+            print(f"PDF resume converted to text successfully.")
+        else:
+            resume_text = load_resume_text(args.input)
 
         generated_yaml = generate_yaml_from_resume(resume_text, schema, api_key)
         save_yaml(generated_yaml, args.output)

From 23567ee7c48618d10a5acdc14665b16321a7104f Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Mon, 2 Sep 2024 01:25:38 -0700
Subject: [PATCH 6/7] updated for generating resume yaml from pdf

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 76621d7..edb4dc8 100644
--- a/README.md
+++ b/README.md
@@ -452,21 +452,21 @@ Each section has specific fields to fill out:
     willing_to_undergo_drug_tests: "No"
     willing_to_undergo_background_checks: "Yes"
   ```
-### 4. Generating plain_text_resume.yaml from a Text Resume
+### 4. Generating plain_text_resume.yaml from a PDF or Text Resume
 
-To simplify the process of creating your `plain_text_resume.yaml` file, you can use the provided script to generate it from a text-based resume. Follow these steps:
+To simplify the process of creating your `plain_text_resume.yaml` file, you can use the provided script to generate it from a PDF or plain-text resume. Follow these steps:
 
-1. Prepare your resume in a plain text format (.txt file).
+1. Prepare your resume as a PDF (`.pdf`) or plain-text (`.txt`) file.
 
-2. Place your text resume in the `data_folder` directory.
+2. Place your resume in the `data_folder` directory.
 
 3. Run the following command:
 
    ```bash
-   python generate_resume_yaml.py --input data_folder/your_resume.txt --output data_folder/plain_text_resume.yaml
+   python resume_yaml_generator.py --input data_folder/your_resume.[pdf|txt] --output data_folder/plain_text_resume.yaml
    ```
 
-   Replace `your_resume.txt` with the actual name of your text resume file.
+   Replace `your_resume.[pdf|txt]` with the actual name of your PDF or text resume file.
 
 4. The script will generate a `plain_text_resume.yaml` file in the `data_folder` directory.
 

From 685da0f9fc71dbb7a0c869efa2f702bab1cdcc1e Mon Sep 17 00:00:00 2001
From: Maurice McCabe <mmcc007@gmail.com>
Date: Mon, 2 Sep 2024 01:44:54 -0700
Subject: [PATCH 7/7] replace lib PYPDF2 with pdfminer.six

---
 requirements.txt         | 32 ++++++++++++++++----------------
 resume_yaml_generator.py |  9 ++-------
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5139ade..03290b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,16 @@
-langchain==0.2.11
-langchain-community==0.2.10
-langchain-core==0.2.24
-langchain-openai==0.1.17
-langchain-text-splitters==0.2.2
-langsmith==0.1.93
-Levenshtein==0.25.1
-openai==1.37.1
-regex==2024.7.24
-reportlab==4.2.2
-selenium==4.9.1
-webdriver-manager==4.0.2
-click
-git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
-linkedin-api
-PyPDF2==3.0.1
\ No newline at end of file
+langchain==0.2.11
+langchain-community==0.2.10
+langchain-core==0.2.24
+langchain-openai==0.1.17
+langchain-text-splitters==0.2.2
+langsmith==0.1.93
+Levenshtein==0.25.1
+openai==1.37.1
+regex==2024.7.24
+reportlab==4.2.2
+selenium==4.9.1
+webdriver-manager==4.0.2
+click
+git+https://github.com/feder-cr/lib_resume_builder_AIHawk.git
+linkedin-api
+pdfminer.six==20221105
\ No newline at end of file
diff --git a/resume_yaml_generator.py b/resume_yaml_generator.py
index acf36f5..46982c2 100644
--- a/resume_yaml_generator.py
+++ b/resume_yaml_generator.py
@@ -5,7 +5,7 @@ import os
 from typing import Dict, Any
 import re
 from jsonschema import validate, ValidationError
-import PyPDF2
+from pdfminer.high_level import extract_text
 
 def load_yaml(file_path: str) -> Dict[str, Any]:
     with open(file_path, 'r') as file:
@@ -118,12 +118,7 @@ def generate_report(validation_result: Dict[str, Any], output_file: str):
     print(report)
 
 def pdf_to_text(pdf_path: str) -> str:
-    text = ""
-    with open(pdf_path, 'rb') as file:
-        reader = PyPDF2.PdfReader(file)
-        for page in reader.pages:
-            text += page.extract_text()
-    return text
+    return extract_text(pdf_path)
 
 def main():
     parser = argparse.ArgumentParser(description="Generate a resume YAML file from a PDF or text resume using OpenAI API")