493 lines
19 KiB
Python
493 lines
19 KiB
Python
import base64
import json
import os
import re
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from uuid import uuid4

import faiss
import fitz  # PyMuPDF
from docx import Document
from docx2pdf import convert
from dotenv import load_dotenv
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from loguru import logger
from openai import OpenAI
|
|
# Copy variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail with an explicit message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` round-trip was a no-op when the key was
# set and raised "TypeError: str expected, not NoneType" when it was not.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

# Shared model handles used by the rest of the module.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
llm = ChatOpenAI(model="gpt-4o")
|
|
|
|
def load_vector_store(file_path: str, embeddings) -> FAISS:
    """
    Read a previously saved FAISS vector store back from disk.

    Args:
    - file_path (str): Location the vector store was saved to.
    - embeddings: Embedding function handed to the loader.

    Returns:
    - FAISS: The vector store loaded from `file_path`.

    NOTE(review): the loader is called with
    allow_dangerous_deserialization=True, so `file_path` should only ever
    point at an index this project created itself - confirm no untrusted
    path can reach this function.
    """
    store = FAISS.load_local(
        file_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store
|
|
|
|
def search_similar_documents(query: str, vector_store: FAISS, top_k: int = 5):
    """
    Run a similarity search against the vector store.

    Args:
    - query (str): Text to search for.
    - vector_store (FAISS): Store to query.
    - top_k (int): Maximum number of matches requested from the store.

    Returns:
    - List of (page_number, page_content) tuples, one per match, in the
      order the store returned them. `page_number` is read from each match's
      metadata.
    """
    matches = []
    for hit in vector_store.similarity_search(query, k=top_k):
        matches.append((hit.metadata['page_number'], hit.page_content))
    return matches
|
|
|
|
|
|
|
|
# Vector index stored under "APA_index" (resolved relative to the current
# working directory), loaded once when the module is imported.
vec_db = load_vector_store(file_path="APA_index", embeddings=embeddings)

# OpenAI SDK client shared by the functions in this module.
client = OpenAI()
|
|
|
|
def pdf_to_images(pdf_path, output_folder='output_images'):
    """
    Render every page of a PDF to a PNG file using PyMuPDF.

    Args:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Folder to save the output images; created if it
      does not exist.

    Returns:
    - List of image file paths in page order
      (`page_1.png`, `page_2.png`, ...).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []

    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap()
            image_path = os.path.join(output_folder, f'page_{page_number}.png')
            pix.save(image_path)
            image_paths.append(image_path)

    return image_paths
|
|
|
|
|
|
def docx_to_images(docx_path, output_folder='output_images'):
    """
    Convert a DOCX file to images via an intermediate PDF rendered with PyMuPDF.

    Args:
    - docx_path (str): Path to the DOCX file.
    - output_folder (str): Folder to save the output images; created if it
      does not exist.

    Returns:
    - List of image file paths.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Stage the intermediate PDF in a throwaway directory. Writing it next to
    # the DOCX would silently overwrite an existing "<name>.pdf" there and
    # leave the generated file behind after the images are produced.
    with tempfile.TemporaryDirectory() as staging_dir:
        stem = os.path.splitext(os.path.basename(docx_path))[0]
        pdf_path = os.path.join(staging_dir, stem + ".pdf")

        # Step 1: Convert DOCX to PDF
        convert(docx_path, pdf_path)

        # Step 2: Convert the intermediate PDF to images (must finish before
        # the staging directory is removed on leaving the `with` block).
        image_paths = pdf_to_images(pdf_path, output_folder)

    return image_paths
|
|
|
|
def document_to_images(file_path, output_folder='output_images'):
    """
    Convert a PDF or DOCX file to images.

    The converter is chosen from the file extension (case-insensitive).

    Args:
    - file_path (str): Path to the document file (PDF or DOCX).
    - output_folder (str): Folder to save the output images. Defaults to the
      same folder the individual converters use by default.

    Returns:
    - List of image file paths.

    Raises:
    - ValueError: If the extension is neither `.pdf` nor `.docx`.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return pdf_to_images(file_path, output_folder)
    if file_extension == '.docx':
        return docx_to_images(file_path, output_folder)

    raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")
|
|
|
|
|
|
def images_to_base64(directory_path):
    """
    Convert all PNG images in the specified directory to Base64-encoded strings.

    Files are returned in natural (numeric-aware) filename order, so
    `page_2.png` comes before `page_10.png`. Directory listing order is
    arbitrary, and callers that number pages by position need a stable,
    page-ordered result.

    Args:
    - directory_path (str): Path to the directory containing image files.

    Returns:
    - List of (filename, base64_string) tuples. Files that cannot be read
      are reported on stdout and skipped.
    """
    # One-element tuple: ('.png') without the trailing comma is just a string.
    supported_extensions = ('.png',)

    def natural_key(name):
        # Split into digit / non-digit runs so embedded numbers compare as
        # integers; the raw name is a tie-breaker to keep the order deterministic.
        parts = [
            int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split(r'(\d+)', name)
        ]
        return (parts, name)

    filenames = sorted(
        (
            name
            for name in os.listdir(directory_path)
            if name.lower().endswith(supported_extensions)
        ),
        key=natural_key,
    )

    base64_images = []
    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'rb') as image_file:
                # Read the image file and encode it to Base64
                encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        except OSError as e:
            print(f"Error processing file {filename}: {e}")
            continue
        base64_images.append((filename, encoded_string))

    return base64_images
|
|
|
|
|
|
# Instruction text sent with every page image; it asks the model for a JSON
# object with "Page/Image", "Errors" and "Summary" keys.
prompt = """
You are an APA Compliance and Document Review Agent, highly specialized in ensuring strict adherence to APA guidelines as defined in the "APA Publication Manual, 7th Edition" by the American Psychological Association.

Your task is to analyze the provided text or document images to identify and correct errors in the following areas:
1. **Grammatical Errors:** Identify grammatical issues, focusing on APA-specific grammar requirements (e.g., third-person writing, formal tone, active voice).
2. **Document Structure Errors:** Ensure the document adheres to APA formatting requirements, including title page layout, abstract structure, headings, and reference list organization.
3. **Referencing Errors:** Detect and correct issues with references, such as missing references, improper formatting, or inconsistencies in style.
4. **Citation Errors:** Identify problems with in-text citations, such as missing elements, improper punctuation, or placement errors.

For each page/image, provide a detailed analysis and return a structured dictionary with the following format:

{
"Page/Image": <Page number or Image identifier>,
"Errors": [
{
"Line Number(s)": <Line number(s) where the error occurs>,
"Error Text": "<Exact text of the flawed element>",
"Description of the Error": "<Detailed explanation of why it is incorrect, referencing specific pages and sections from the APA 7th Edition Manual>",
"Suggested Correction": "<The correct or improved version of the text>"
},
...
],
"Summary": "<A brief summary stating whether the page/image meets APA standards or the total number of errors detected.>"
}

**Additional Instructions:**
1. For grammar, include both generic grammatical errors and APA-specific grammar violations. Cite the relevant page and section for APA grammar standards.
2. For document structure, verify that all APA-required sections are present and correctly formatted. Reference the relevant section (e.g., "Running Head: APA 7th Edition, p. 37").
3. For citations and references, explicitly state the page number and section of the "APA Publication Manual, 7th Edition" that supports your findings.
4. Be strict and exhaustive in your evaluation, ensuring no potential flaws are overlooked.
5. Include specific and detailed descriptions that allow the user to locate the correction in the APA manual.
6. If no errors are found on a page/image, include a summary confirming adherence to APA standards and set "Errors" to an empty list.

Your response must follow this structured format and be concise, well-structured, and formatted in JSON for easy parsing.

"""
|
|
|
|
def evaluate_images_for_citations(image_directory:str):
    """
    Evaluate page images for APA errors with the vision model.

    Each PNG in `image_directory` is sent to the model together with the
    module-level `prompt`, and the JSON reply is reduced to a result record.

    Args:
    - image_directory (str): Path to the directory containing image files.

    Returns:
    - List of dictionaries, one per image, with the keys "Page/Image"
      ("Image <n>", numbered by position in the list returned by
      `images_to_base64`), "Errors" (list) and "Summary" (str). When a reply
      cannot be parsed, "Errors" is empty and "Summary" says so explicitly
      instead of claiming that no errors were found.
    """
    # Convert images to Base64
    base64_images = images_to_base64(image_directory)

    results = []

    for page_number, (filename, encoded_string) in enumerate(base64_images, start=1):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # images_to_base64 only returns .png files,
                                # so label the payload as PNG.
                                "url": f"data:image/png;base64,{encoded_string}",
                            },
                        },
                    ],
                }
            ],
        )

        parsed = _parse_model_json(response.choices[0].message.content)

        if parsed is None:
            logger.error(f"Error processing response content for {filename}: reply is not a JSON object")
            errors = []
            summary = "The model response could not be parsed; this page was not evaluated."
        else:
            errors = parsed.get("Errors")
            if not isinstance(errors, list):
                # Tolerate a null or otherwise malformed "Errors" value.
                errors = []
            summary = parsed.get("Summary", "No errors found.")

        # Create a structured dictionary for the results
        results.append({
            "Page/Image": f"Image {page_number}",
            "Errors": errors,
            "Summary": summary,
        })

    return results


def _parse_model_json(raw_content):
    """Decode a model reply into a dict; return None if it is not a JSON object.

    Accepts bare JSON as well as JSON wrapped in a Markdown code fence
    (with or without a "json" language tag), and falls back to the outermost
    {...} span when extra text surrounds the object.
    """
    if not raw_content:
        return None

    text = raw_content.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]

    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None
|
|
|
|
|
|
def extract_error_descriptions(documents):
    """
    Collect every non-empty 'Description of the Error' value.

    Parameters:
    documents (list): A list of dictionaries containing page information and
    errors. A page without an "Errors" key contributes nothing.

    Returns:
    list: The descriptions in the order they appear. Errors whose description
    is missing or empty are skipped.
    """
    candidates = (
        entry.get("Description of the Error")
        for page in documents
        for entry in page.get("Errors", [])
    )
    return [text for text in candidates if text]
|
|
|
|
|
|
|
|
def extract_error_texts(documents):
    """
    Collect every non-empty 'Error Text' value.

    Parameters:
    documents (list): A list of dictionaries containing page information and
    errors. A page without an "Errors" key contributes nothing.

    Returns:
    list: The error texts in the order they appear. Errors whose text is
    missing or empty are skipped.
    """
    collected = []
    for page in documents:
        for entry in page.get("Errors", []):
            text = entry.get("Error Text")
            if not text:
                continue
            collected.append(text)
    return collected
|
|
|
|
|
|
def extract_errors_minimal(doc_errors):
    """
    Flatten per-page error lists into one list of minimal error records.

    Args:
    - doc_errors (list): A list of dictionaries representing document pages
      with errors.

    Returns:
    - list: One dictionary per error, in page order then error order, with
      the keys "Doc Page", "Line Number(s)" and "Error Text". Missing values
      fall back to "Unknown Page", "Unknown Line" and "No Error Text".
    """
    return [
        {
            "Doc Page": page_data.get("Page/Image", "Unknown Page"),
            "Line Number(s)": entry.get("Line Number(s)", "Unknown Line"),
            "Error Text": entry.get("Error Text", "No Error Text"),
        }
        for page_data in doc_errors
        for entry in page_data.get("Errors", [])
    ]
|
|
|
|
|
|
|
|
def get_similar_documents(descriptions, vec_db, k=10):
    """
    Retrieve similar documents from the vector database for each description.

    Lookups run concurrently, but the result list always lines up with
    `descriptions`: entry i belongs to description i.

    Args:
    - descriptions (list): A list of descriptions to find similar documents for.
    - vec_db: The vector database instance to query.
    - k (int): The number of similar documents to retrieve for each description.

    Returns:
    - list: One list per description, each holding (page_number, page_content)
      tuples. A description whose lookup failed gets an empty list so that
      positions stay aligned with the input.
    """
    logger.info("Getting Similar Documents")

    def fetch_similar_docs(description):
        # Use the description to search for similar documents in the vector database
        similar_docs = search_similar_documents(description, vec_db, k)
        return [(doc[0], doc[1]) for doc in similar_docs]

    similar_documents_content = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_similar_docs, desc) for desc in descriptions]

        # Collect in submission order, not completion order, so callers can
        # index the result in parallel with `descriptions`.
        for future in futures:
            try:
                similar_documents_content.append(future.result())
            except Exception as e:
                logger.error(f"Error processing description: {e}")
                similar_documents_content.append([])

    return similar_documents_content
|
|
|
|
|
|
def delete_folder(folder_path):
    """
    Remove a folder together with everything inside it.

    Args:
    - folder_path (str): The path to the folder to be deleted.

    Returns:
    - bool: True if the folder existed and was removed; False if it did not
      exist or if removal raised an error. The outcome is also printed.
    """
    try:
        if not os.path.exists(folder_path):
            print(f"Folder '{folder_path}' does not exist.")
            return False

        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' deleted successfully.")
        return True
    except Exception as e:
        print(f"An error occurred while deleting the folder: {e}")
        return False
|
|
|
|
|
|
def identify_reference(error:str, error_description:str, reference_pages:list) -> dict:
    """
    Ask the LLM which reference page addresses an error and how to correct it.

    The three arguments are substituted into a prompt template, sent to the
    module-level `llm`, and the reply is parsed as JSON.

    Args:
    - error (str): The flagged text.
    - error_description (str): Explanation of why the text is wrong.
    - reference_pages (list): Candidate (page, content) tuples from the
      reference document.

    Returns:
    - dict: The parsed JSON reply. The prompt asks for the keys
      `reference_page`, `content`, `correction_explanation` and `correction`;
      their presence depends on the model's answer.

    NOTE(review): the template wraps its text in `<|start_header_id|>`-style
    chat markers although `llm` is configured as gpt-4o - confirm these
    markers are intended.
    """
    logger.info("Identifying Reference Pages ")

    template_text = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an Advanced Document Analysis AI Agent. You are very good with understanding books and identifying the right one.
You are assigned a task of selecting the right reference book for a Grammatical and Citation error that occured in a document.
You are provided with the following information:
1. Identified error.
2. The description of the error.
3. A list of tuples of the reference document's page and content.
Note: The reference document is a text book that talks about APA citation, the book is "APA Publication Manual, 7th Edition".
Your task is to do the following:
1. Understand the provided information.
2. Identify the right document that is the properly document reference that talks about the error and error description.
3. You might want to reference multiple contents in the shared documents, but it must be the same page.
4. Identify the page and the reference statement (might be a combo of multiple statements).
5. Generate a correction explanation to that error based on the reference document.
6. Generate the corrected version of the error

So after that you want to prepare a JSON that has the following details:
1. reference_page: The identified page as seen above.
2. content: The content that speaks about the error and how to fix it as seen the identified page in the reference document.
3. correction_explanation: The explanation correction to the error as mentioned in the referenced page.
4. correction: The correct version of the error.

Verify the following output:
1. reference_page.
2. content.
3. correction_explanation.
4. correction.

Please make sure they are there.
It should always come out this way.

Lastly, the JSON structure is very important.

<|eot_id|><|start_header_id|>user<|end_header_id|>
Error: {error}\n
Error_Description: {error_description}\n
Reference_Pages: {reference_pages}\n

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    reference_prompt = PromptTemplate(
        template=template_text,
        input_variables=["error", "error_description", "reference_pages"],
    )

    # prompt -> model -> JSON parser
    chain = reference_prompt | llm | JsonOutputParser()

    payload = {
        "error": error,
        "error_description": error_description,
        "reference_pages": reference_pages,
    }
    return chain.invoke(payload)
|
|
|
|
|
|
def process_document_errors(doc_errors, vec_db=vec_db, k=10):
    """
    Enrich each detected error with reference details and a correction.

    Every error in `doc_errors` is flattened to a minimal record, matched
    with reference pages retrieved for its description, and passed to
    `identify_reference`; the reply fields are merged into the record.

    Args:
    - doc_errors (list): A list of dictionaries containing errors from the document.
    - vec_db: The vector database instance to query.
    - k (int): The number of similar documents to retrieve for each description.

    Returns:
    - list: One record per error with "Doc Page", "Line Number(s)",
      "Error Text" plus "reference_page", "reference_page_content",
      "correction_explanation" and "correction". A field the model reply
      does not contain is set to None.
    """
    # extract_errors_minimal emits exactly one record per error, in page order
    # then error order, so walking the raw errors the same way yields a list
    # aligned with it. The description is taken from the raw error because
    # the minimal record does not carry it.
    flattened_errors = extract_errors_minimal(doc_errors)
    raw_errors = [
        raw_error
        for page_data in doc_errors
        for raw_error in page_data.get("Errors", [])
    ]
    descriptions = [
        raw_error.get("Description of the Error") or ""
        for raw_error in raw_errors
    ]

    # Only non-empty descriptions are searched; results are mapped back by
    # index so an error without a description cannot shift its neighbours.
    # NOTE: assumes get_similar_documents returns results in input order.
    searchable = [i for i, description in enumerate(descriptions) if description]
    retrieved = get_similar_documents(
        [descriptions[i] for i in searchable], vec_db, k
    )
    reference_pages = dict(zip(searchable, retrieved))

    updated_errors = []

    # Loop through each error to identify references
    for i, (error, description) in enumerate(zip(flattened_errors, descriptions)):
        reference_info = identify_reference(
            error["Error Text"], description, reference_pages.get(i, [])
        )
        logger.debug(f"reference_info: {reference_info}")

        if not isinstance(reference_info, dict):
            # The parser can return non-dict JSON; treat it as "no details".
            reference_info = {}

        # Update the error dictionary with the reference information
        error.update({
            "reference_page": reference_info.get('reference_page'),
            "reference_page_content": reference_info.get('content'),
            "correction_explanation": reference_info.get('correction_explanation'),
            "correction": reference_info.get('correction'),
        })

        updated_errors.append(error)

    return updated_errors