# ds_citationpro/utils.py

import fitz  # PyMuPDF
import os
import base64
from docx import Document
from docx2pdf import convert
from openai import OpenAI
from dotenv import load_dotenv
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from loguru import logger
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate
from dotenv import load_dotenv
import faiss
from uuid import uuid4
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from concurrent.futures import ThreadPoolExecutor, as_completed
import shutil
from loguru import logger
import json
# Load variables from a .env file into the process environment before any
# client below is constructed.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None to
# os.environ would otherwise raise "TypeError: str expected, not NoneType".
_openai_api_key = os.getenv("OPENAI_API_KEY")
if _openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = _openai_api_key

# Shared model handles, created once at import time and reused by the
# functions in this module.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
llm = ChatOpenAI(model="gpt-4o")

def load_vector_store(file_path: str, embeddings) -> FAISS:
    """
    Restore a previously saved FAISS vector store from disk.

    Args:
    - file_path (str): Location the vector store was saved to.
    - embeddings: Embedding function handed to FAISS when loading.

    Returns:
    - FAISS: The vector store read from `file_path`.
    """
    # NOTE(review): allow_dangerous_deserialization=True presumably permits
    # pickle-based loading, so file_path should only point at a trusted
    # index - confirm against the LangChain documentation.
    store = FAISS.load_local(
        file_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store

def search_similar_documents(query: str, vector_store: FAISS, top_k: int = 5):
    """
    Run a similarity search against the vector store.

    Args:
    - query (str): Text to look up.
    - vector_store (FAISS): Vector store that is searched.
    - top_k (int): How many matches to request.

    Returns:
    - List of (page_number, page_content) tuples, one per match, in the order
      the vector store returned them.
    """
    matches = vector_store.similarity_search(query, k=top_k)

    hits = []
    for match in matches:
        # Every match is expected to carry a 'page_number' metadata entry.
        hits.append((match.metadata['page_number'], match.page_content))
    return hits


# Load the index stored under "APA_index" once, at import time, using the
# module-level embeddings. process_document_errors uses it as its default
# vec_db argument.
vec_db = load_vector_store("APA_index", embeddings)

# OpenAI client built with default arguments; evaluate_images_for_citations
# sends its chat-completion requests through it.
client = OpenAI()

def pdf_to_images(pdf_path, output_folder='output_images'):
    """
    Render every page of a PDF to a PNG file using PyMuPDF.

    Pages are written as ``page_<n>.png`` (1-based) inside ``output_folder``.

    Args:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Folder to save the output images; created if missing.

    Returns:
    - List of image file paths, in page order.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []

    # The context manager closes the document even when rendering or saving a
    # page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap()
            image_path = os.path.join(output_folder, f'page_{page_number}.png')
            pix.save(image_path)
            image_paths.append(image_path)

    return image_paths


def docx_to_images(docx_path, output_folder='output_images'):
    """
    Convert a DOCX file to page images via an intermediate PDF.

    The DOCX is first converted to a PDF written next to the source file
    (same path with a ``.pdf`` extension); that PDF is then rendered with
    pdf_to_images.

    Args:
    - docx_path (str): Path to the DOCX file.
    - output_folder (str): Folder to save the output images; created if missing.

    Returns:
    - List of image file paths.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Convert DOCX to PDF.
    # NOTE(review): the intermediate PDF is not removed afterwards and shares
    # the DOCX's base name, so a pre-existing PDF at that path may be replaced
    # - confirm this is acceptable.
    pdf_path = os.path.splitext(docx_path)[0] + ".pdf"
    convert(docx_path, pdf_path)

    # Step 2: Convert the intermediate PDF to images.
    return pdf_to_images(pdf_path, output_folder)

def document_to_images(file_path, output_folder='output_images'):
    """
    Convert a PDF or DOCX file to images.

    The converter is chosen from the file extension (case-insensitive).

    Args:
    - file_path (str): Path to the document file (PDF or DOCX).
    - output_folder (str): Folder to save the output images. Defaults to the
      same folder the individual converters use by default.

    Returns:
    - List of image file paths.

    Raises:
    - ValueError: If the extension is neither .pdf nor .docx.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return pdf_to_images(file_path, output_folder)
    if file_extension == '.docx':
        return docx_to_images(file_path, output_folder)

    raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")


def images_to_base64(directory_path):
    """
    Convert all PNG images in the specified directory to Base64-encoded strings.

    Files are returned in natural filename order, so ``page_2.png`` comes
    before ``page_10.png``. Callers that number pages by list position rely on
    this; the raw ``os.listdir`` order is arbitrary.

    Args:
    - directory_path (str): Path to the directory containing image files.

    Returns:
    - List of tuples containing the image filename and its Base64-encoded
      string. Files that cannot be read are reported and skipped.
    """
    import re  # local import: only needed for the natural-sort key below

    # Must be a real tuple; ('.png') without the comma is just a string.
    supported_extensions = ('.png',)

    def natural_key(name):
        # re.split with a capture group alternates text and digit runs, so
        # odd positions are always the numeric parts.
        parts = re.split(r'([0-9]+)', name.lower())
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    base64_images = []

    for filename in sorted(os.listdir(directory_path), key=natural_key):
        if not filename.lower().endswith(supported_extensions):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'rb') as image_file:
                encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        except OSError as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing file {filename}: {e}")
            continue

        base64_images.append((filename, encoded_string))

    return base64_images


# Instruction text sent together with each page image by
# evaluate_images_for_citations. It asks the model for a JSON object with
# "Page/Image", "Errors" and "Summary" keys; that function reads "Errors" and
# "Summary" back out of the reply.
prompt = """
You are an APA Compliance and Document Review Agent, highly specialized in ensuring strict adherence to APA guidelines as defined in the "APA Publication Manual, 7th Edition" by the American Psychological Association.

Your task is to analyze the provided text or document images to identify and correct errors in the following areas:
1. **Grammatical Errors:** Identify grammatical issues, focusing on APA-specific grammar requirements (e.g., third-person writing, formal tone, active voice).
2. **Document Structure Errors:** Ensure the document adheres to APA formatting requirements, including title page layout, abstract structure, headings, and reference list organization.
3. **Referencing Errors:** Detect and correct issues with references, such as missing references, improper formatting, or inconsistencies in style.
4. **Citation Errors:** Identify problems with in-text citations, such as missing elements, improper punctuation, or placement errors.

For each page/image, provide a detailed analysis and return a structured dictionary with the following format:

{
    "Page/Image": <Page number or Image identifier>,
    "Errors": [
        {
            "Line Number(s)": <Line number(s) where the error occurs>,
            "Error Text": "<Exact text of the flawed element>",
            "Description of the Error": "<Detailed explanation of why it is incorrect, referencing specific pages and sections from the APA 7th Edition Manual>",
            "Suggested Correction": "<The correct or improved version of the text>"
        },
        ...
    ],
    "Summary": "<A brief summary stating whether the page/image meets APA standards or the total number of errors detected.>"
}

**Additional Instructions:**
1. For grammar, include both generic grammatical errors and APA-specific grammar violations. Cite the relevant page and section for APA grammar standards.
2. For document structure, verify that all APA-required sections are present and correctly formatted. Reference the relevant section (e.g., "Running Head: APA 7th Edition, p. 37").
3. For citations and references, explicitly state the page number and section of the "APA Publication Manual, 7th Edition" that supports your findings.
4. Be strict and exhaustive in your evaluation, ensuring no potential flaws are overlooked.
5. Include specific and detailed descriptions that allow the user to locate the correction in the APA manual.
6. If no errors are found on a page/image, include a summary confirming adherence to APA standards and set "Errors" to an empty list.

Your response must follow this structured format and be concise, well-structured, and formatted in JSON for easy parsing.

"""

def _parse_model_json(raw_content):
    """Return the JSON object embedded in a model reply as a dict, or None if there is none."""
    if not raw_content:
        return None

    # Take the outermost {...} span. This copes with a bare JSON reply, a reply
    # wrapped in a Markdown code fence, and a reply with prose around the
    # object, instead of assuming fixed-width fence markers.
    start = raw_content.find("{")
    end = raw_content.rfind("}")
    if start == -1 or end < start:
        print("Error processing response content: no JSON object found")
        return None

    try:
        parsed = json.loads(raw_content[start:end + 1])
    except json.JSONDecodeError as e:
        print(f"Error processing response content: {e}")
        return None

    # Anything other than an object cannot provide "Errors" / "Summary".
    return parsed if isinstance(parsed, dict) else None


def evaluate_images_for_citations(image_directory:str):
    """
    Evaluate images for APA citation errors.

    Each image returned by images_to_base64 is sent to the chat model together
    with the module-level prompt, and the JSON object in the reply is reduced
    to its "Errors" and "Summary" entries.

    Args:
    - image_directory (str): Path to the directory containing image files.

    Returns:
    - List of dictionaries, one per image, with the keys "Page/Image"
      ("Image <n>", numbered from 1 in the order images_to_base64 returned
      them), "Errors" and "Summary". When a reply cannot be parsed, "Errors"
      is empty and "Summary" says so rather than claiming a clean page.
    """
    base64_images = images_to_base64(image_directory)

    results = []

    for page_number, (_filename, encoded_string) in enumerate(base64_images, start=1):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # images_to_base64 only yields PNG files, so
                                # label the payload accordingly.
                                "url": f"data:image/png;base64,{encoded_string}",
                            },
                        },
                    ],
                }
            ],
        )

        parsed = _parse_model_json(response.choices[0].message.content)

        if parsed is None:
            # Do not report an unparsed page as clean.
            errors = []
            summary = "Response could not be parsed; page was not evaluated."
        else:
            errors = parsed.get("Errors", [])
            summary = parsed.get("Summary", "No errors found.")

        results.append({
            "Page/Image": f"Image {page_number}",
            "Errors": errors,
            "Summary": summary,
        })

    return results


def extract_error_descriptions(documents):
    """
    Collect every non-empty 'Description of the Error' across all documents.

    Parameters:
    documents (list): Dictionaries holding page information and an optional
        "Errors" list.

    Returns:
    list: The descriptions, in document order and then error order. Errors
        whose description is missing or empty are skipped.
    """
    # Lazily walk every error of every document; a missing "Errors" key counts
    # as "no errors".
    candidates = (
        entry.get("Description of the Error")
        for document in documents
        for entry in document.get("Errors", [])
    )
    return [text for text in candidates if text]


def extract_error_texts(documents):
    """
    Collect every non-empty 'Error Text' across all documents.

    Parameters:
    documents (list): Dictionaries holding page information and an optional
        "Errors" list.

    Returns:
    list: The error texts, in document order and then error order. Errors
        whose text is missing or empty are skipped.
    """
    # Lazily walk every error of every document; a missing "Errors" key counts
    # as "no errors".
    candidates = (
        entry.get("Error Text")
        for document in documents
        for entry in document.get("Errors", [])
    )
    return [text for text in candidates if text]


def extract_errors_minimal(doc_errors):
    """
    Flatten per-page error lists into one list of compact error records.

    Args:
    - doc_errors (list): Dictionaries representing document pages, each with
      an optional "Page/Image" label and "Errors" list.

    Returns:
    - list: One dictionary per error, in page order and then error order, with
            the keys "Doc Page", "Line Number(s)" and "Error Text". Missing
            values fall back to "Unknown Page", "Unknown Line" and
            "No Error Text".
    """
    def summarize(page_label, entry):
        # Keep only the fields needed to locate the error in the document.
        return {
            "Doc Page": page_label,
            "Line Number(s)": entry.get("Line Number(s)", "Unknown Line"),
            "Error Text": entry.get("Error Text", "No Error Text"),
        }

    return [
        summarize(page_data.get("Page/Image", "Unknown Page"), entry)
        for page_data in doc_errors
        for entry in page_data.get("Errors", [])
    ]


def get_similar_documents(descriptions, vec_db, k=10):
    """
    Retrieve similar documents from the vector database for each description.

    Lookups run concurrently on a thread pool, but results are collected in
    submission order so that entry ``i`` of the result always belongs to
    ``descriptions[i]``.

    Args:
    - descriptions (list): A list of descriptions to find similar documents for.
    - vec_db: The vector database instance to query.
    - k (int): The number of similar documents to retrieve for each description.

    Returns:
    - list: One entry per description, in input order. Each entry is the list
      of (page number, page content) tuples returned for that description, or
      an empty list if the lookup failed.
    """
    logger.info("Getting Similar Documents")

    def fetch_similar_docs(description):
        # Use the description as the query text for the vector database.
        return search_similar_documents(description, vec_db, k)

    similar_documents_content = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_similar_docs, desc) for desc in descriptions]

        # Walk the futures in submission order (not completion order) to keep
        # results aligned with the input list.
        for future in futures:
            try:
                similar_documents_content.append(future.result())
            except Exception as e:
                logger.error(f"Error processing description: {e}")
                # Keep a placeholder so later entries do not shift position.
                similar_documents_content.append([])

    return similar_documents_content


def delete_folder(folder_path):
    """
    Remove a folder together with everything inside it.

    Args:
    - folder_path (str): The path to the folder to be deleted.

    Returns:
    - bool: True if the folder existed and was removed; False if it did not
      exist or if removing it raised an error.
    """
    try:
        # Guard clause: nothing to do when the path is absent.
        if not os.path.exists(folder_path):
            print(f"Folder '{folder_path}' does not exist.")
            return False

        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' deleted successfully.")
        return True
    except Exception as e:
        print(f"An error occurred while deleting the folder: {e}")
        return False


def identify_reference(error:str, error_description:str, reference_pages:list) -> dict:
    """
    Ask the LLM which reference page covers an error and how to correct it.

    The error, its description and the candidate reference pages are filled
    into a prompt, sent to the module-level llm, and the reply is parsed with
    JsonOutputParser.

    Args:
    - error (str): Text of the identified error.
    - error_description (str): Explanation of why it is an error.
    - reference_pages (list): Candidate (page, content) tuples from the
      reference document.

    Returns:
    - The parsed reply. The prompt requests the keys reference_page, content,
      correction_explanation and correction; the reply is not validated here.
    """
    logger.info("Identifying Reference Pages ")

    reference_prompt = PromptTemplate(
        input_variables=["error", "error_description", "reference_pages"],
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are an Advanced Document Analysis AI Agent. You are very good with understanding books and identifying the right one.
        You are assigned a task of selecting the right reference book for a Grammatical and Citation error that occured in a document.
        You are provided with the following information:
            1. Identified error.
            2. The description of the error.
            3. A list of tuples of the reference document's page and content.
        Note: The reference document is a text book that talks about APA citation, the book is "APA Publication Manual, 7th Edition".
        Your task is to do the following:
            1. Understand the provided information.
            2. Identify the right document that is the properly document reference that talks about the error and error description.
            3. You might want to reference multiple contents in the shared documents, but it must be the same page.
            4. Identify the page and the reference statement (might be a combo of multiple statements).
            5. Generate a correction explanation to that error based on the reference document.
            6. Generate the corrected version of the error

        So after that you want to prepare a JSON that has the following details:
            1. reference_page: The identified page as seen above.
            2. content: The content that speaks about the error and how to fix it as seen the identified page in  the reference document.
            3. correction_explanation: The explanation correction to the error as mentioned in the referenced page.
            4. correction: The correct version of the error.

        Verify the following output:
            1. reference_page.
            2. content.
            3. correction_explanation.
            4. correction.

        Please make sure they are there.
        It should always come out this way.

        Lastly, the JSON structure is very important.

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Error: {error}\n
    Error_Description: {error_description}\n
    Reference_Pages: {reference_pages}\n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    )

    # prompt -> model -> JSON parser, run as a single chain.
    chain = reference_prompt | llm | JsonOutputParser()
    payload = {
        "error": error,
        "error_description": error_description,
        "reference_pages": reference_pages,
    }
    return chain.invoke(payload)


def process_document_errors(doc_errors, vec_db=vec_db, k=10):
    """
    Enrich every detected error with a reference page and a correction.

    For each error, candidate reference pages are fetched from the vector
    database, identify_reference is asked to pick the matching page and
    produce a correction, and the result is merged into the flattened error.

    Args:
    - doc_errors (list): A list of dictionaries containing errors from the document.
    - vec_db: The vector database instance to query.
    - k (int): The number of similar documents to retrieve for each description.

    Returns:
    - list: One dictionary per error (as produced by extract_errors_minimal)
      extended with "reference_page", "reference_page_content",
      "correction_explanation" and "correction". A value is None when the
      model reply did not contain the corresponding key.
    """
    flattened_errors = extract_errors_minimal(doc_errors)

    # Walk the raw errors in the same page/error order extract_errors_minimal
    # uses, keeping one text and one description per error. Filtering out
    # blank values here would shift the lists against flattened_errors and
    # pair errors with the wrong description.
    raw_errors = [
        raw
        for page_data in doc_errors
        for raw in page_data.get("Errors", [])
    ]
    error_texts = [raw.get("Error Text") or "" for raw in raw_errors]
    error_descriptions = [raw.get("Description of the Error") or "" for raw in raw_errors]

    # Search by description, falling back to the error text when it is blank.
    queries = [
        description or text
        for text, description in zip(error_texts, error_descriptions)
    ]

    # NOTE(review): results are paired with errors by position, which relies
    # on get_similar_documents returning one entry per query in input order -
    # confirm.
    reference_pages = get_similar_documents(queries, vec_db, k)

    updated_errors = []
    for i, error in enumerate(flattened_errors):
        # Tolerate a short result list instead of raising IndexError.
        pages = reference_pages[i] if i < len(reference_pages) else []

        reference_info = identify_reference(error_texts[i], error_descriptions[i], pages)
        logger.debug("reference_info: {}", reference_info)

        # The parsed reply is model output; do not assume its shape.
        if not isinstance(reference_info, dict):
            reference_info = {}

        error.update({
            "reference_page": reference_info.get("reference_page"),
            "reference_page_content": reference_info.get("content"),
            "correction_explanation": reference_info.get("correction_explanation"),
            "correction": reference_info.get("correction"),
        })

        updated_errors.append(error)

    return updated_errors