Update matching logic: AI scores all candidates, lower threshold, absolute amount, prompt improvements

This commit is contained in:
Iyeoluwa Akinrinola
2025-07-02 16:38:01 +01:00
commit a519c42866
10641 changed files with 3944174 additions and 0 deletions
@@ -0,0 +1,144 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
from typing import Dict, List, Union
from .._utils import StreamType, deprecate_with_replacement
from ..constants import OutlineFontFlag
from ._annotations import AnnotationBuilder
from ._base import (
BooleanObject,
ByteStringObject,
FloatObject,
IndirectObject,
NameObject,
NullObject,
NumberObject,
PdfObject,
TextStringObject,
encode_pdfdocencoding,
)
from ._data_structures import (
ArrayObject,
ContentStream,
DecodedStreamObject,
Destination,
DictionaryObject,
EncodedStreamObject,
Field,
StreamObject,
TreeObject,
read_object,
)
from ._fit import Fit
from ._outline import Bookmark, OutlineItem
from ._rectangle import RectangleObject
from ._utils import (
create_string_object,
decode_pdfdocencoding,
hex_to_rgb,
read_hex_string_from_stream,
read_string_from_stream,
)
def readHexStringFromStream(
    stream: StreamType,
) -> Union["TextStringObject", "ByteStringObject"]:  # pragma: no cover
    """
    Deprecated camelCase spelling of ``read_hex_string_from_stream``.

    Reports the deprecation through ``deprecate_with_replacement`` and then
    forwards ``stream`` unchanged to the snake_case function.

    :param stream: stream handed on to ``read_hex_string_from_stream``
    :return: whatever ``read_hex_string_from_stream`` returns
    """
    old_name = "readHexStringFromStream"
    new_name = "read_hex_string_from_stream"
    deprecate_with_replacement(old_name, new_name, "4.0.0")
    result = read_hex_string_from_stream(stream)
    return result
def readStringFromStream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:  # pragma: no cover
    """
    Deprecated camelCase spelling of ``read_string_from_stream``.

    Reports the deprecation through ``deprecate_with_replacement`` and then
    forwards both arguments unchanged to the snake_case function.

    :param stream: stream handed on to ``read_string_from_stream``
    :param forced_encoding: passed through as the second positional argument
    :return: whatever ``read_string_from_stream`` returns
    """
    old_name = "readStringFromStream"
    new_name = "read_string_from_stream"
    deprecate_with_replacement(old_name, new_name, "4.0.0")
    result = read_string_from_stream(stream, forced_encoding)
    return result
def createStringObject(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:  # pragma: no cover
    """
    Deprecated camelCase spelling of ``create_string_object``.

    Reports the deprecation through ``deprecate_with_replacement`` and then
    forwards both arguments unchanged to the snake_case function.

    :param string: value handed on to ``create_string_object``
    :param forced_encoding: passed through as the second positional argument
    :return: whatever ``create_string_object`` returns
    """
    old_name = "createStringObject"
    new_name = "create_string_object"
    deprecate_with_replacement(old_name, new_name, "4.0.0")
    result = create_string_object(string, forced_encoding)
    return result
# Module-level constant holding the result of ``Fit.fit()``, created once at
# import time and exported below.
PAGE_FIT = Fit.fit()

# Explicit list of the names this package exports (what ``import *`` exposes).
# The deprecated camelCase wrappers defined above are intentionally not listed.
__all__ = [
    # Base types
    "BooleanObject",
    "FloatObject",
    "NumberObject",
    "NameObject",
    "IndirectObject",
    "NullObject",
    "PdfObject",
    "TextStringObject",
    "ByteStringObject",
    # Annotations
    "AnnotationBuilder",
    # Fit
    "Fit",
    "PAGE_FIT",
    # Data structures
    "ArrayObject",
    "DictionaryObject",
    "TreeObject",
    "StreamObject",
    "DecodedStreamObject",
    "EncodedStreamObject",
    "ContentStream",
    "RectangleObject",
    "Field",
    "Destination",
    # --- More specific stuff
    # Outline
    "OutlineItem",
    "OutlineFontFlag",
    "Bookmark",
    # Data structures core functions
    "read_object",
    # Utility functions
    "create_string_object",
    "encode_pdfdocencoding",
    "decode_pdfdocencoding",
    "hex_to_rgb",
    "read_hex_string_from_stream",
    "read_string_from_stream",
]
@@ -0,0 +1,275 @@
from typing import Optional, Tuple, Union
from ._base import (
BooleanObject,
FloatObject,
NameObject,
NumberObject,
TextStringObject,
)
from ._data_structures import ArrayObject, DictionaryObject
from ._fit import DEFAULT_FIT, Fit
from ._rectangle import RectangleObject
from ._utils import hex_to_rgb
class AnnotationBuilder:
    """
    The AnnotationBuilder creates dictionaries representing PDF annotations.

    Those dictionaries can be modified before they are added to a PdfWriter
    instance via `writer.add_annotation`.

    See `adding PDF annotations <../user/adding-pdf-annotations.html>`_ for
    its usage combined with PdfWriter.
    """

    from ..types import FitType, ZoomArgType

    @staticmethod
    def text(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = 0,
    ) -> DictionaryObject:
        """
        Add text annotation.

        :param Tuple[int, int, int, int] rect:
            or array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``
        :param str text: Stored as the annotation's ``/Contents`` entry
        :param bool open: Stored as the annotation's ``/Open`` entry
        :param int flags: Stored as the annotation's ``/Flags`` entry
        :return: The dictionary describing the text annotation
        """
        # TABLE 8.23 Additional entries specific to a text annotation
        text_obj = DictionaryObject(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Text"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Contents"): TextStringObject(text),
                NameObject("/Open"): BooleanObject(open),
                NameObject("/Flags"): NumberObject(flags),
            }
        )
        return text_obj

    @staticmethod
    def free_text(
        text: str,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: str = "000000",
        background_color: str = "ffffff",
    ) -> DictionaryObject:
        """
        Add text in a rectangle to a page.

        :param str text: Text to be added
        :param RectangleObject rect: or array of four integers
            specifying the clickable rectangular area ``[xLL, yLL, xUR, yUR]``
        :param str font: Name of the Font, e.g. 'Helvetica'
        :param bool bold: Print the text in bold
        :param bool italic: Print the text in italic
        :param str font_size: How big the text will be, e.g. '14pt'
        :param str font_color: Hex-string for the color
        :param str border_color: Hex-string for the border color
        :param str background_color: Hex-string for the background of the annotation
        :return: The dictionary describing the free-text annotation
        """
        # Style string stored under /DS: "font: [bold ][italic ]<font> <size>;..."
        font_str = "font: "
        if bold is True:
            font_str = font_str + "bold "
        if italic is True:
            font_str = font_str + "italic "
        font_str = font_str + font + " " + font_size
        font_str = font_str + ";text-align:left;color:#" + font_color

        # Appearance string stored under /DA: the border colour components,
        # each followed by a space, terminated by the "rg" operator.
        bg_color_str = ""
        for st in hex_to_rgb(border_color):
            bg_color_str = bg_color_str + str(st) + " "
        bg_color_str = bg_color_str + "rg"

        free_text = DictionaryObject()
        free_text.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/FreeText"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Contents"): TextStringObject(text),
                # font size color
                NameObject("/DS"): TextStringObject(font_str),
                # border color
                NameObject("/DA"): TextStringObject(bg_color_str),
                # background color
                NameObject("/C"): ArrayObject(
                    [FloatObject(n) for n in hex_to_rgb(background_color)]
                ),
            }
        )
        return free_text

    @staticmethod
    def line(
        p1: Tuple[float, float],
        p2: Tuple[float, float],
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str = "",
        title_bar: str = "",
    ) -> DictionaryObject:
        """
        Draw a line on the PDF.

        :param Tuple[float, float] p1: First point
        :param Tuple[float, float] p2: Second point
        :param RectangleObject rect: or array of four
                integers specifying the clickable rectangular area
                ``[xLL, yLL, xUR, yUR]``
        :param str text: Text to be displayed as the line annotation
        :param str title_bar: Text to be displayed in the title bar of the
            annotation; by convention this is the name of the author
        :return: The dictionary describing the line annotation
        """
        line_obj = DictionaryObject(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Line"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/T"): TextStringObject(title_bar),
                NameObject("/L"): ArrayObject(
                    [
                        FloatObject(p1[0]),
                        FloatObject(p1[1]),
                        FloatObject(p2[0]),
                        FloatObject(p2[1]),
                    ]
                ),
                # Line-ending styles are PDF names. Passing Python's None
                # would yield the text "None" without the leading slash,
                # which is not a valid name token; use the name /None.
                NameObject("/LE"): ArrayObject(
                    [
                        NameObject("/None"),
                        NameObject("/None"),
                    ]
                ),
                NameObject("/IC"): ArrayObject(
                    [
                        FloatObject(0.5),
                        FloatObject(0.5),
                        FloatObject(0.5),
                    ]
                ),
                NameObject("/Contents"): TextStringObject(text),
            }
        )
        return line_obj

    @staticmethod
    def rectangle(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> DictionaryObject:
        """
        Draw a rectangle on the PDF.

        :param RectangleObject rect: or array of four
                integers specifying the clickable rectangular area
                ``[xLL, yLL, xUR, yUR]``
        :param str interiour_color: Optional hex-string; when given (and
            non-empty) it is converted with ``hex_to_rgb`` and stored as ``/IC``
        :return: The dictionary describing the square annotation
        """
        square_obj = DictionaryObject(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Square"),
                NameObject("/Rect"): RectangleObject(rect),
            }
        )

        if interiour_color:
            square_obj[NameObject("/IC")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(interiour_color)]
            )

        return square_obj

    @staticmethod
    def link(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
    ) -> DictionaryObject:
        """
        Add a link to the document.

        The link can either be an external link or an internal link.

        An external link requires the URL parameter.
        An internal link requires the target_page_index, fit, and fit args.

        :param RectangleObject rect: or array of four
            integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``
        :param border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. No border will be
            drawn if this argument is omitted.
            - horizontal corner radius,
            - vertical corner radius, and
            - border width
            - Optionally: Dash
        :param str url: Link to a website (if you want to make an external link)
        :param int target_page_index: index of the page to which the link should go
                                (if you want to make an internal link)
        :param Fit fit: Page fit or 'zoom' option.
        :return: The dictionary describing the link annotation
        :raises ValueError: if neither or both of ``url`` and
            ``target_page_index`` are given
        """
        from ..types import BorderArrayType

        is_external = url is not None
        is_internal = target_page_index is not None
        if not is_external and not is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. Both were None."
            )
        if is_external and is_internal:
            raise ValueError(
                f"Either 'url' or 'target_page_index' have to be provided. url={url}, target_page_index={target_page_index}"
            )

        border_arr: BorderArrayType
        if border is not None:
            # NOTE(review): the border entries are numeric but are wrapped in
            # NameObject here; kept unchanged -- confirm whether a numeric
            # object type is intended.
            border_arr = [NameObject(n) for n in border[:3]]
            if len(border) == 4:
                dash_pattern = ArrayObject([NameObject(n) for n in border[3]])
                border_arr.append(dash_pattern)
        else:
            border_arr = [NumberObject(0)] * 3

        link_obj = DictionaryObject(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Link"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Border"): ArrayObject(border_arr),
            }
        )
        if is_external:
            link_obj[NameObject("/A")] = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
        if is_internal:
            # This needs to be updated later!
            dest_deferred = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
            link_obj[NameObject("/Dest")] = dest_deferred
        return link_obj
@@ -0,0 +1,648 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import codecs
import decimal
import hashlib
import re
from binascii import unhexlify
from typing import Any, Callable, List, Optional, Tuple, Union, cast
from .._codecs import _pdfdoc_encoding_rev
from .._protocols import PdfObjectProtocol, PdfWriterProtocol
from .._utils import (
StreamType,
b_,
deprecation_with_replacement,
hex_str,
hexencode,
logger_warning,
read_non_whitespace,
read_until_regex,
str_,
)
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
class PdfObject(PdfObjectProtocol):
    """Base class shared by the PDF object types defined in this module."""

    # Hash constructor applied by hash_value(); SHA-1 unless overridden.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_value_data(self) -> bytes:
        """Return the bytes that hash_value() feeds into ``hash_func``."""
        rendered = "%s" % self
        return rendered.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "PdfObject":
        """
        Clone this object into ``pdf_dest``; the base class only raises.

        :param pdf_dest: destination writer (PdfWriterProtocol)
        :param force_duplicate: normally an object that was already cloned and
            referenced is reused; when True a new copy is always made
        :param ignore_fields: list/tuple of field names (for dictionaries)
            that are ignored during cloning, including for children
        :raises Exception: always, with the message ``"clone PdfObject"``

        Subclasses in this module implement clone via ``_reference_clone``.
        """
        raise Exception("clone PdfObject")

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` when this object carries
        an ``indirect_reference`` attribute.

        If the clone already points into ``pdf_dest`` it is returned as is;
        if this object's reference was translated before, the previously
        registered object is returned instead of ``clone``.
        """
        # Best effort: any failure while inspecting the clone's reference
        # simply means "not yet registered in pdf_dest".
        try:
            already_there = clone.indirect_reference.pdf == pdf_dest
            if already_there:
                return clone
        except Exception:
            pass

        if not hasattr(self, "indirect_reference"):
            return clone

        ind = self.indirect_reference
        new_idnum = len(pdf_dest._objects) + 1
        if ind is not None:
            source_key = id(ind.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
            if ind.idnum in pdf_dest._id_translated[source_key]:
                known = pdf_dest.get_object(
                    pdf_dest._id_translated[source_key][ind.idnum]
                )
                assert known is not None
                return known
            pdf_dest._id_translated[source_key][ind.idnum] = new_idnum
        pdf_dest._objects.append(clone)
        clone.indirect_reference = IndirectObject(new_idnum, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references; a direct object is its own target."""
        return self

    def getObject(self) -> Optional["PdfObject"]:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`get_object`."""
        deprecation_with_replacement("getObject", "get_object", "3.0.0")
        return self.get_object()

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Serialize the object to ``stream``; not implemented on the base."""
        raise NotImplementedError
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        fresh = NullObject()
        return cast("NullObject", self._reference_clone(fresh, pdf_dest))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write the literal ``null`` keyword; ``encryption_key`` is unused."""
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and return a NullObject.

        :raises PdfReadError: if those four bytes are not ``b"null"``
        """
        token = stream.read(4)
        if token == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    def __repr__(self) -> str:
        return "NullObject"

    @staticmethod
    def readFromStream(stream: StreamType) -> "NullObject":  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`read_from_stream`."""
        deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0")
        return NullObject.read_from_stream(stream)
class BooleanObject(PdfObject):
    """PDF boolean wrapping an arbitrary ``value`` judged by truthiness."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        return cast("BooleanObject", self._reference_clone(duplicate, pdf_dest))

    def __eq__(self, __o: object) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(__o, BooleanObject):
            return self.value == __o.value
        if isinstance(__o, bool):
            return self.value == __o
        return False

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write ``true`` or ``false``; ``encryption_key`` is unused."""
        keyword = b"true" if self.value else b"false"
        stream.write(keyword)

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Four bytes are read first; after ``b"fals"`` one more byte is
        consumed (without being checked) to finish the keyword.

        :raises PdfReadError: if the four bytes are neither ``true`` nor ``fals``
        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word == b"fals":
            stream.read(1)
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")

    @staticmethod
    def readFromStream(stream: StreamType) -> "BooleanObject":  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`read_from_stream`."""
        deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0")
        return BooleanObject.read_from_stream(stream)
class IndirectObject(PdfObject):
    """
    Reference to an object of a PDF, identified by object number,
    generation number and the owning ``pdf`` (a reader or writer).
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "IndirectObject":
        """
        Clone the referenced object into pdf_dest and return its reference.

        :param pdf_dest: destination writer
        :param force_duplicate: when True, always clone the target again
            instead of reusing an earlier translation
        :param ignore_fields: forwarded to the target object's ``clone``
        :return: the ``indirect_reference`` of the (re)used duplicate
        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}

        if not force_duplicate and self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
        else:
            obj = self.get_object()
            assert obj is not None
            dup = obj.clone(pdf_dest, force_duplicate, ignore_fields)
        assert dup is not None
        assert dup.indirect_reference is not None
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect reference is its own reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf``; None if unresolved."""
        obj = self.pdf.get_object(self)
        if obj is None:
            return None
        return obj.get_object()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: Any) -> bool:
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: Any) -> bool:
        return not self.__eq__(other)

    def __hash__(self) -> int:
        # Defining __eq__ alone makes Python set __hash__ to None, leaving
        # references unusable as dict keys or set members. Hash exactly the
        # fields __eq__ compares (the pdf by identity) so equal objects
        # always hash equal.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write ``<idnum> <generation> R``; ``encryption_key`` is unused."""
        stream.write(b_(f"{self.idnum} {self.generation} R"))

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        :param stream: byte stream positioned at the object number
        :param pdf: owner stored on the returned reference
        :raises PdfStreamError: if the stream ends inside either number
        :raises PdfReadError: if the token after the numbers is not ``R``
        """
        # Object number: every byte up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: skip leading whitespace, then read up to the
        # next whitespace byte.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex_str(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

    @staticmethod
    def readFromStream(
        stream: StreamType, pdf: Any  # PdfReader
    ) -> "IndirectObject":  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`read_from_stream`."""
        deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0")
        return IndirectObject.read_from_stream(stream, pdf)
class FloatObject(decimal.Decimal, PdfObject):
    """PDF real number, stored as a ``decimal.Decimal``."""

    def __new__(
        cls, value: Union[str, Any] = "0", context: Optional[Any] = None
    ) -> "FloatObject":
        try:
            return decimal.Decimal.__new__(cls, str_(value), context)
        except Exception:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(f"FloatObject ({value}) invalid; use 0.0 instead", __name__)
            return decimal.Decimal.__new__(cls, "0.0")

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        duplicate = FloatObject(self)
        return cast("FloatObject", self._reference_clone(duplicate, pdf_dest))

    def __repr__(self) -> str:
        is_whole = self == self.to_integral()
        if is_whole:
            # Whole numbers are rendered without any decimal point.
            return str(self.quantize(decimal.Decimal(1)))
        # Fixed-point rendering with superfluous trailing zeros removed.
        fixed = f"{self:f}"
        return fixed.rstrip("0")

    def as_numeric(self) -> float:
        """Return the value as a Python float, parsed from ``repr(self)``."""
        encoded = repr(self).encode("utf8")
        return float(encoded)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write ``repr(self)`` as UTF-8; ``encryption_key`` is unused."""
        encoded = repr(self).encode("utf8")
        stream.write(encoded)

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)
class NumberObject(int, PdfObject):
    """PDF integer number, stored as a Python ``int``."""

    # Matches the first byte that cannot belong to a number token.
    # NOTE(review): inside the class, ``+-.`` is a range (0x2B-0x2E) and so
    # also accepts ','; confirm whether that is intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        try:
            converted = int(value)
            return int.__new__(cls, converted)
        except ValueError:
            # Unparsable input falls back to 0 after logging a warning.
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = NumberObject(self)
        return cast("NumberObject", self._reference_clone(duplicate, pdf_dest))

    def as_numeric(self) -> int:
        """Return the value as a plain int, parsed from ``repr(self)``."""
        encoded = repr(self).encode("utf8")
        return int(encoded)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write ``repr(self)`` as UTF-8; ``encryption_key`` is unused."""
        encoded = repr(self).encode("utf8")
        stream.write(encoded)

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        :return: a FloatObject when the token contains ``.``, otherwise a
            NumberObject
        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        has_decimal_point = num.find(b".") != -1
        if has_decimal_point:
            return FloatObject(num)
        return NumberObject(num)

    @staticmethod
    def readFromStream(
        stream: StreamType,
    ) -> Union["NumberObject", "FloatObject"]:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`read_from_stream`."""
        deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0")
        return NumberObject.read_from_stream(stream)
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast("ByteStringObject", self._reference_clone(duplicate, pdf_dest))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """
        Write the bytes as a hex string ``<...>``.

        When ``encryption_key`` is truthy the bytes are first passed through
        ``RC4_encrypt`` with that key.
        """
        payload = self
        if encryption_key:
            from .._security import RC4_encrypt

            payload = RC4_encrypt(encryption_key, payload)  # type: ignore
        stream.write(b"<")
        stream.write(hexencode(payload))
        stream.write(b">")

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)
class TextStringObject(str, PdfObject):
    """
    Represents a string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
    occur.
    """

    # Flags recording which autodetection produced this string; they are
    # read by get_original_bytes() and copied by clone().
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over both autodetect flags."""
        obj = TextStringObject(self)
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        return cast("TextStringObject", self._reference_clone(obj, pdf_dest))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the string according to the autodetect flags.

        :raises Exception: if neither autodetect flag is set
        """
        # We're a text string object, but the library is trying to get our raw
        # bytes.  This can happen if we auto-detected this string as text, but
        # we were wrong.  It's pretty common.  Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        elif self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        else:
            raise Exception("no information about original bytes")

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """
        Write the string to ``stream``.

        With a truthy ``encryption_key`` the encoded bytes are RC4-encrypted
        and written as a hex string; otherwise they are written as a literal
        string ``(...)`` in which every byte that is neither alphanumeric nor
        a space is emitted as a three-digit octal escape.
        """
        # Try to write the string out as a PDFDocEncoding encoded string.  It's
        # nicer to look at in the PDF file.  Sadly, we take a performance hit
        # here for trying...
        try:
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if encryption_key:
            from .._security import RC4_encrypt

            bytearr = RC4_encrypt(encryption_key, bytearr)
            obj = ByteStringObject(bytearr)
            obj.write_to_stream(stream, None)
        else:
            # Iterating bytes yields ints, so the space must be compared by
            # code point; comparing against b" " never matched and every
            # space was needlessly written as \040.
            space = ord(" ")
            stream.write(b"(")
            for c in bytearr:
                if not chr(c).isalnum() and c != space:
                    # This:
                    #   stream.write(b_(rf"\{c:0>3o}"))
                    # gives
                    #   https://github.com/davidhalter/parso/issues/207
                    stream.write(b_("\\%03o" % c))
                else:
                    stream.write(b_(chr(c)))
            stream.write(b")")

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)
class NameObject(str, PdfObject):
    """PDF name object (``/Something``), stored as a Python ``str``."""

    # Bytes that terminate a name token when reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    # First byte every name must start with (attribute name kept as is).
    surfix = b"/"
    # Characters written as ``#XX`` by renumber(): '#', every delimiter that
    # delimiter_pattern stops on when reading, and all code points 0..32.
    renumber_table = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast("NameObject", self._reference_clone(NameObject(self), pdf_dest))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write the escaped name; ``encryption_key`` is unused."""
        stream.write(self.renumber())  # b_(renumber(self)))

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`write_to_stream`."""
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    def renumber(self) -> bytes:
        """
        Return the name encoded for output.

        The first character is emitted as is (a warning is logged when it is
        not ``/``). Of the remaining characters, those above ``~`` are
        written as ``#XX`` per UTF-8 byte, those listed in
        ``renumber_table`` use their table entry, and all others are emitted
        UTF-8 encoded.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            logger_warning(f"Incorrect first char in NameObject:({self})", __name__)
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` hex escape in ``sin`` by the byte it encodes.

        A ``#`` that is not followed by two hex digits is left untouched.
        """
        pos = sin.find(b"#", 0)
        while pos >= 0:
            digits = sin[pos + 1 : pos + 3]
            # Only a complete two-digit escape can be decoded; a truncated
            # one (fewer than two bytes left) is kept verbatim.
            if len(digits) == 2:
                try:
                    sin = sin[:pos] + unhexlify(digits) + sin[pos + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hexa
                    # we change nothing and carry on
                    pass
            # Always continue at the next '#'. Stepping a single byte and
            # decoding at a non-'#' position would splice unrelated bytes.
            # Searching from pos + 1 also skips a '#' produced by "#23".
            pos = sin.find(b"#", pos + 1)
        return sin

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        :param stream: byte stream positioned at the leading ``/``
        :param pdf: object whose ``strict`` attribute decides whether an
            undecodable name raises or is decoded with ``charmap``
        :raises PdfReadError: if the first byte is not ``/``, or if the name
            cannot be decoded and ``pdf.strict`` is truthy
        """
        name = stream.read(1)
        if name != NameObject.surfix:
            raise PdfReadError("name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            # Try the candidate encodings in order; the first that decodes wins.
            for enc in ("utf-8", "gbk"):
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in Name Object ({repr(name)})", __name__
                )
                return NameObject(name.decode("charmap"))
            else:
                raise PdfReadError(
                    f"Illegal character in Name Object ({repr(name)})"
                ) from e

    @staticmethod
    def readFromStream(
        stream: StreamType, pdf: Any  # PdfReader
    ) -> "NameObject":  # pragma: no cover
        """Deprecated camelCase spelling of :meth:`read_from_stream`."""
        deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0")
        return NameObject.read_from_stream(stream, pdf)
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` character by character via ``_pdfdoc_encoding_rev``.

    :param unicode_string: text to encode
    :return: the concatenated encoded bytes
    :raises UnicodeEncodeError: if a character is missing from the table
    """
    pieces = []
    for char in unicode_string:
        try:
            pieces.append(b_(chr(_pdfdoc_encoding_rev[char])))
        except KeyError:
            raise UnicodeEncodeError(
                "pdfdocencoding", char, -1, -1, "does not exist in translation table"
            )
    return b"".join(pieces)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,129 @@
from typing import Any, Optional, Tuple, Union
class Fit:
    """
    Description of how a destination page is fitted into the viewer window.

    An instance stores the fit type as a ``NameObject`` (``fit_type``) and
    the arguments of that type as a list of PDF objects (``fit_args``). The
    class methods are named constructors, one per fit type.
    """

    def __init__(
        self, fit_type: str, fit_args: Tuple[Union[None, float, Any], ...] = tuple()
    ):
        """
        Store the fit type and convert its arguments to PDF objects.

        :param str fit_type: Name of the fit type, e.g. ``"/XYZ"``.
        :param fit_args: Arguments of the fit type. ``None`` and
            ``NullObject`` entries are stored as a new ``NullObject``; every
            other entry is wrapped in ``FloatObject``.
        """
        from ._base import FloatObject, NameObject, NullObject

        self.fit_type = NameObject(fit_type)
        converted = []
        for arg in fit_args:
            if arg is None or isinstance(arg, NullObject):
                converted.append(NullObject())
            else:
                converted.append(FloatObject(arg))
        self.fit_args = converted

    @classmethod
    def xyz(
        cls,
        left: Optional[float] = None,
        top: Optional[float] = None,
        zoom: Optional[float] = None,
    ) -> "Fit":
        """
        Build a ``/XYZ`` fit.

        The page is displayed with the coordinates (left, top) positioned at
        the upper-left corner of the window and its contents magnified by
        the factor zoom.

        A null value for left, top or zoom means that the current value of
        that parameter is retained unchanged. A zoom value of 0 has the same
        meaning as a null value.
        """
        return Fit("/XYZ", (left, top, zoom))

    @classmethod
    def fit(cls) -> "Fit":
        """
        Build a ``/Fit`` fit.

        The page is magnified just enough to fit entirely within the window
        both horizontally and vertically. If the two required magnification
        factors differ, the smaller one is used and the page is centred
        within the window in the other dimension.
        """
        return Fit("/Fit")

    @classmethod
    def fit_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Build a ``/FitH`` fit.

        The page is displayed with the vertical coordinate top positioned at
        the top edge of the window and magnified just enough to fit its
        entire width within the window.

        A null value for `top` means that the current value of that
        parameter is retained unchanged.
        """
        return Fit("/FitH", (top,))

    @classmethod
    def fit_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Build a ``/FitV`` fit with the single argument ``left``.

        This is the vertical counterpart of :meth:`fit_horizontally`: the
        horizontal coordinate left is the one handed to the fit type.
        """
        return Fit("/FitV", (left,))

    @classmethod
    def fit_rectangle(
        cls,
        left: Optional[float] = None,
        bottom: Optional[float] = None,
        right: Optional[float] = None,
        top: Optional[float] = None,
    ) -> "Fit":
        """
        Build a ``/FitR`` fit.

        The page is magnified just enough to fit the rectangle given by
        left, bottom, right and top entirely within the window both
        horizontally and vertically.

        If the two required magnification factors differ, the smaller one is
        used and the rectangle is centred within the window in the other
        dimension.

        A null value for any of the parameters may result in unpredictable
        behavior.
        """
        return Fit("/FitR", (left, bottom, right, top))

    @classmethod
    def fit_box(cls) -> "Fit":
        """
        Build a ``/FitB`` fit.

        The page is magnified just enough to fit its bounding box entirely
        within the window both horizontally and vertically. If the two
        required magnification factors differ, the smaller one is used and
        the bounding box is centred within the window in the other dimension.
        """
        return Fit("/FitB")

    @classmethod
    def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Build a ``/FitBH`` fit.

        The page is displayed with the vertical coordinate top positioned at
        the top edge of the window and magnified just enough to fit the
        entire width of its bounding box within the window.

        A null value for top means that the current value of that parameter
        is retained unchanged.
        """
        return Fit("/FitBH", (top,))

    @classmethod
    def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Build a ``/FitBV`` fit.

        The page is displayed with the horizontal coordinate left positioned
        at the left edge of the window and magnified just enough to fit the
        entire height of its bounding box within the window.

        A null value for left means that the current value of that parameter
        is retained unchanged.
        """
        return Fit("/FitBV", (left,))

    def __str__(self) -> str:
        """Return ``Fit(<type>)``, with the argument list added if non-empty."""
        if self.fit_args:
            return f"Fit({self.fit_type}, {self.fit_args})"
        return f"Fit({self.fit_type})"
# Module-level default fit: the argument-less "/Fit" type built by Fit.fit().
DEFAULT_FIT = Fit.fit()
@@ -0,0 +1,35 @@
from typing import Any, Union
from .._utils import StreamType, deprecation_with_replacement
from ._base import NameObject
from ._data_structures import Destination
class OutlineItem(Destination):
    """A ``Destination`` that serialises itself as an outline-item dictionary."""

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """
        Write this item to ``stream`` as a ``<< ... >>`` dictionary.

        Of the keys ``/Title``, ``/Parent``, ``/First``, ``/Last``, ``/Next``
        and ``/Prev``, those present in this object are written in that
        order, each followed by its raw value. A ``/Dest`` entry holding
        ``self.dest_array`` is always written last.

        :param stream: The stream the serialised dictionary is written to.
        :param encryption_key: Passed unchanged to the ``write_to_stream``
            call of every key and value.
        """
        stream.write(b"<<\n")
        outline_keys = ("/Title", "/Parent", "/First", "/Last", "/Next", "/Prev")
        # Membership is resolved for all keys before any entry is written.
        present_keys = [NameObject(name) for name in outline_keys if name in self]
        for entry_key in present_keys:
            entry_key.write_to_stream(stream, encryption_key)
            stream.write(b" ")
            self.raw_get(entry_key).write_to_stream(stream, encryption_key)
            stream.write(b"\n")
        dest_key = NameObject("/Dest")
        dest_key.write_to_stream(stream, encryption_key)
        stream.write(b" ")
        self.dest_array.write_to_stream(stream, encryption_key)
        stream.write(b"\n")
        stream.write(b">>")
class Bookmark(OutlineItem):  # pragma: no cover
    """Deprecated name of :class:`OutlineItem`, kept for old callers."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Invoke the deprecation helper, then initialise as an ``OutlineItem``.

        All positional and keyword arguments are forwarded unchanged to the
        parent initialiser.
        """
        deprecation_with_replacement("Bookmark", "OutlineItem", "3.0.0")
        super().__init__(*args, **kwargs)
@@ -0,0 +1,265 @@
import decimal
from typing import Any, List, Tuple, Union
from .._utils import deprecation_no_replacement, deprecation_with_replacement
from ._base import FloatObject, NumberObject
from ._data_structures import ArrayObject
class RectangleObject(ArrayObject):
    """
    Four-element array used to represent *page boxes* in PyPDF2.

    The elements are stored in the order left, bottom, right, top. The page
    boxes represented this way include:

    * :attr:`artbox <PyPDF2._page.PageObject.artbox>`
    * :attr:`bleedbox <PyPDF2._page.PageObject.bleedbox>`
    * :attr:`cropbox <PyPDF2._page.PageObject.cropbox>`
    * :attr:`mediabox <PyPDF2._page.PageObject.mediabox>`
    * :attr:`trimbox <PyPDF2._page.PageObject.trimbox>`
    """

    def __init__(
        self, arr: Union["RectangleObject", Tuple[float, float, float, float]]
    ) -> None:
        """
        Build the rectangle from a sequence of exactly four coordinates.

        :param arr: The four coordinates (left, bottom, right, top).
        """
        # a rectangle is described by exactly four numbers
        assert len(arr) == 4
        # entries that are not already NumberObject/FloatObject become FloatObject
        coordinates = [self._ensure_is_number(entry) for entry in arr]
        ArrayObject.__init__(self, coordinates)  # type: ignore

    def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
        """Return ``value`` as is if it is a PDF number, else as ``FloatObject``."""
        if isinstance(value, (NumberObject, FloatObject)):
            return value
        return FloatObject(value)

    def scale(self, sx: float, sy: float) -> "RectangleObject":
        """
        Return a new rectangle with x coordinates multiplied by ``sx`` and
        y coordinates multiplied by ``sy``; this rectangle is not modified.
        """
        scaled_left = float(self.left) * sx
        scaled_bottom = float(self.bottom) * sy
        scaled_right = float(self.right) * sx
        scaled_top = float(self.top) * sy
        return RectangleObject((scaled_left, scaled_bottom, scaled_right, scaled_top))

    def ensureIsNumber(
        self, value: Any
    ) -> Union[FloatObject, NumberObject]:  # pragma: no cover
        """Deprecated public spelling of :meth:`_ensure_is_number`."""
        deprecation_no_replacement("ensureIsNumber", "3.0.0")
        return self._ensure_is_number(value)

    def __repr__(self) -> str:
        """Return ``RectangleObject([...])`` listing the four elements."""
        return f"RectangleObject({repr(list(self))})"

    # -- single coordinates (array indices 0..3) ---------------------------

    @property
    def left(self) -> FloatObject:
        """The left x coordinate (element 0)."""
        return self[0]

    @left.setter
    def left(self, f: float) -> None:
        self[0] = FloatObject(f)

    @property
    def bottom(self) -> FloatObject:
        """The bottom y coordinate (element 1)."""
        return self[1]

    @bottom.setter
    def bottom(self, f: float) -> None:
        self[1] = FloatObject(f)

    @property
    def right(self) -> FloatObject:
        """The right x coordinate (element 2)."""
        return self[2]

    @right.setter
    def right(self, f: float) -> None:
        self[2] = FloatObject(f)

    @property
    def top(self) -> FloatObject:
        """The top y coordinate (element 3)."""
        return self[3]

    @top.setter
    def top(self, f: float) -> None:
        self[3] = FloatObject(f)

    # -- deprecated single-coordinate getters ------------------------------

    def getLowerLeft_x(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`left`."""
        deprecation_with_replacement("getLowerLeft_x", "left", "3.0.0")
        return self.left

    def getLowerLeft_y(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`bottom`."""
        deprecation_with_replacement("getLowerLeft_y", "bottom", "3.0.0")
        return self.bottom

    def getUpperRight_x(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`right`."""
        deprecation_with_replacement("getUpperRight_x", "right", "3.0.0")
        return self.right

    def getUpperRight_y(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`top`."""
        deprecation_with_replacement("getUpperRight_y", "top", "3.0.0")
        return self.top

    def getUpperLeft_x(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`left`."""
        deprecation_with_replacement("getUpperLeft_x", "left", "3.0.0")
        return self.left

    def getUpperLeft_y(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`top`."""
        deprecation_with_replacement("getUpperLeft_y", "top", "3.0.0")
        return self.top

    def getLowerRight_x(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`right`."""
        deprecation_with_replacement("getLowerRight_x", "right", "3.0.0")
        return self.right

    def getLowerRight_y(self) -> FloatObject:  # pragma: no cover
        """Deprecated getter for :attr:`bottom`."""
        deprecation_with_replacement("getLowerRight_y", "bottom", "3.0.0")
        return self.bottom

    # -- corners as (x, y) pairs -------------------------------------------

    @property
    def lower_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]:
        """
        The lower left corner of this box as an (x, y) pair; assignable
        with a two-element sequence.
        """
        return (self.left, self.bottom)

    @lower_left.setter
    def lower_left(self, value: List[Any]) -> None:
        new_x, new_y = map(self._ensure_is_number, value)
        self[0] = new_x
        self[1] = new_y

    @property
    def lower_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]:
        """
        The lower right corner of this box as an (x, y) pair; assignable
        with a two-element sequence.
        """
        return (self.right, self.bottom)

    @lower_right.setter
    def lower_right(self, value: List[Any]) -> None:
        new_x, new_y = map(self._ensure_is_number, value)
        self[2] = new_x
        self[1] = new_y

    @property
    def upper_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]:
        """
        The upper left corner of this box as an (x, y) pair; assignable
        with a two-element sequence.
        """
        return (self.left, self.top)

    @upper_left.setter
    def upper_left(self, value: List[Any]) -> None:
        new_x, new_y = map(self._ensure_is_number, value)
        self[0] = new_x
        self[3] = new_y

    @property
    def upper_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]:
        """
        The upper right corner of this box as an (x, y) pair; assignable
        with a two-element sequence.
        """
        return (self.right, self.top)

    @upper_right.setter
    def upper_right(self, value: List[Any]) -> None:
        new_x, new_y = map(self._ensure_is_number, value)
        self[2] = new_x
        self[3] = new_y

    # -- deprecated corner getters / setters -------------------------------

    def getLowerLeft(
        self,
    ) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated getter for :attr:`lower_left`."""
        deprecation_with_replacement("getLowerLeft", "lower_left", "3.0.0")
        return self.lower_left

    def getLowerRight(
        self,
    ) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated getter for :attr:`lower_right`."""
        deprecation_with_replacement("getLowerRight", "lower_right", "3.0.0")
        return self.lower_right

    def getUpperLeft(
        self,
    ) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated getter for :attr:`upper_left`."""
        deprecation_with_replacement("getUpperLeft", "upper_left", "3.0.0")
        return self.upper_left

    def getUpperRight(
        self,
    ) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated getter for :attr:`upper_right`."""
        deprecation_with_replacement("getUpperRight", "upper_right", "3.0.0")
        return self.upper_right

    def setLowerLeft(self, value: Tuple[float, float]) -> None:  # pragma: no cover
        """Deprecated setter for :attr:`lower_left`."""
        deprecation_with_replacement("setLowerLeft", "lower_left", "3.0.0")
        self.lower_left = value  # type: ignore

    def setLowerRight(self, value: Tuple[float, float]) -> None:  # pragma: no cover
        """Deprecated setter writing elements 2 (x) and 1 (y)."""
        deprecation_with_replacement("setLowerRight", "lower_right", "3.0.0")
        new_x, new_y = map(self._ensure_is_number, value)
        self[2] = new_x
        self[1] = new_y

    def setUpperLeft(self, value: Tuple[float, float]) -> None:  # pragma: no cover
        """Deprecated setter writing elements 0 (x) and 3 (y)."""
        deprecation_with_replacement("setUpperLeft", "upper_left", "3.0.0")
        new_x, new_y = map(self._ensure_is_number, value)
        self[0] = new_x
        self[3] = new_y

    def setUpperRight(self, value: Tuple[float, float]) -> None:  # pragma: no cover
        """Deprecated setter writing elements 2 (x) and 3 (y)."""
        deprecation_with_replacement("setUpperRight", "upper_right", "3.0.0")
        new_x, new_y = map(self._ensure_is_number, value)
        self[2] = new_x
        self[3] = new_y

    # -- extents -----------------------------------------------------------

    @property
    def width(self) -> decimal.Decimal:
        """Horizontal extent, computed as ``right - left``."""
        return self.right - self.left

    def getWidth(self) -> decimal.Decimal:  # pragma: no cover
        """Deprecated getter for :attr:`width`."""
        deprecation_with_replacement("getWidth", "width", "3.0.0")
        return self.width

    @property
    def height(self) -> decimal.Decimal:
        """Vertical extent, computed as ``top - bottom``."""
        return self.top - self.bottom

    def getHeight(self) -> decimal.Decimal:  # pragma: no cover
        """Deprecated getter for :attr:`height`."""
        deprecation_with_replacement("getHeight", "height", "3.0.0")
        return self.height

    # -- deprecated camelCase corner properties ----------------------------

    @property
    def lowerLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated spelling of :attr:`lower_left`."""
        deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0")
        return self.lower_left

    @lowerLeft.setter
    def lowerLeft(
        self, value: Tuple[decimal.Decimal, decimal.Decimal]
    ) -> None:  # pragma: no cover
        deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0")
        self.lower_left = value

    @property
    def lowerRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated spelling of :attr:`lower_right`."""
        deprecation_with_replacement("lowerRight", "lower_right", "3.0.0")
        return self.lower_right

    @lowerRight.setter
    def lowerRight(
        self, value: Tuple[decimal.Decimal, decimal.Decimal]
    ) -> None:  # pragma: no cover
        deprecation_with_replacement("lowerRight", "lower_right", "3.0.0")
        self.lower_right = value

    @property
    def upperLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated spelling of :attr:`upper_left`."""
        deprecation_with_replacement("upperLeft", "upper_left", "3.0.0")
        return self.upper_left

    @upperLeft.setter
    def upperLeft(
        self, value: Tuple[decimal.Decimal, decimal.Decimal]
    ) -> None:  # pragma: no cover
        deprecation_with_replacement("upperLeft", "upper_left", "3.0.0")
        self.upper_left = value

    @property
    def upperRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]:  # pragma: no cover
        """Deprecated spelling of :attr:`upper_right`."""
        deprecation_with_replacement("upperRight", "upper_right", "3.0.0")
        return self.upper_right

    @upperRight.setter
    def upperRight(
        self, value: Tuple[decimal.Decimal, decimal.Decimal]
    ) -> None:  # pragma: no cover
        deprecation_with_replacement("upperRight", "upper_right", "3.0.0")
        self.upper_right = value
@@ -0,0 +1,172 @@
import codecs
from typing import Dict, List, Tuple, Union
from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, b_, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject
def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal colour string to an RGB triple of floats.

    Leading ``#`` characters are stripped; the two-character groups at
    offsets 0, 2 and 4 are then parsed as base-16 integers and each is
    divided by 255.0.

    :param str value: The colour, e.g. ``"#ff8800"`` or ``"ff8800"``.
    :return: The ``(red, green, blue)`` components.
    :raises ValueError: If one of the groups is not a valid base-16 number
        (this includes a group that is empty because the input is too short).
    """
    digits = value.lstrip("#")
    channels = []
    for start in (0, 2, 4):
        channels.append(int(digits[start : start + 2], 16) / 255.0)
    return tuple(channels)  # type: ignore
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string from ``stream`` and build a string object.

    One byte is consumed up front (presumably the opening ``<`` - the caller
    is expected to have positioned the stream there). Non-whitespace tokens
    are then collected until ``>`` is met; every time two have accumulated
    they are parsed as one base-16 value. A single leftover digit is padded
    with a trailing ``0``.

    :param stream: The stream to read from.
    :param forced_encoding: Passed unchanged to ``create_string_object``.
    :return: The result of ``create_string_object`` for the decoded data.
    :raises PdfStreamError: If the stream ends before ``>`` is found.
    """
    stream.read(1)
    decoded_chars = []
    pending = b""
    # iter() with a sentinel stops as soon as the closing ">" is returned.
    for tok in iter(lambda: read_non_whitespace(stream), b">"):
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += tok
        if len(pending) == 2:
            decoded_chars.append(chr(int(pending, base=16)))
            pending = b""
    if len(pending) == 1:
        pending += b"0"
    if len(pending) == 2:
        decoded_chars.append(chr(int(pending, base=16)))
    return create_string_object(b_("".join(decoded_chars)), forced_encoding)
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a literal (parenthesised) string from ``stream``.

    One byte is consumed up front (presumably the opening ``(`` - the caller
    is expected to have positioned the stream there). Bytes are then read
    one at a time until the matching closing parenthesis; nested, balanced
    parentheses are kept as part of the string. Backslash escapes are
    translated:

    * known single-character escapes are mapped through a lookup table,
    * ``\\ddd`` (one to three octal digits) becomes the byte with that
      value, with high-order overflow ignored,
    * a backslash followed by a line break produces nothing,
    * for any other character a warning is logged and the character is
      kept without the backslash.

    :param stream: The stream to read from.
    :param forced_encoding: Passed unchanged to ``create_string_object``.
    :return: The result of ``create_string_object`` for the collected bytes.
    :raises PdfStreamError: If the stream ends before the string is closed.
    """
    # Loop-invariant lookup table: built once per call, not once per escape.
    escape_dict = {
        b"n": b"\n",
        b"r": b"\r",
        b"t": b"\t",
        b"b": b"\b",
        b"f": b"\f",
        b"c": rb"\c",
        b"(": b"(",
        b")": b")",
        b"/": b"/",
        b"\\": b"\\",
        b" ": b" ",
        b"%": b"%",
        b"<": b"<",
        b">": b">",
        b"[": b"[",
        b"]": b"]",
        b"#": b"#",
        b"_": b"_",
        b"&": b"&",
        b"$": b"$",
    }
    stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            try:
                tok = escape_dict[tok]
            except KeyError:
                if b"0" <= tok <= b"7":
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    for _ in range(2):
                        ntok = stream.read(1)
                        if b"0" <= ntok <= b"7":
                            tok += ntok
                        else:
                            # ntok has to be analysed by the main loop. At
                            # EOF nothing was read, so there is nothing to
                            # push back (seeking would rewind onto the
                            # digit that was already consumed).
                            if ntok:
                                stream.seek(-1, 1)
                            break
                    # Mask to one byte: values above 0o377 overflow and the
                    # high-order bits are ignored, as quoted above.
                    tok = b_(chr(int(tok, base=8) & 0xFF))
                elif tok in b"\n\r":
                    # This case is hit when a backslash followed by a line
                    # break occurs. If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b"\n\r":
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    tok = b""
                else:
                    # A lone byte >= 0x80 is not valid UTF-8; a strict decode
                    # would raise here and abort parsing just to build a
                    # warning. Undecodable bytes are shown as \xNN instead.
                    shown = tok.decode("utf8", "backslashreplace")
                    msg = f"Unexpected escaped string: {shown}"
                    logger_warning(msg, __name__)
        txt.append(tok)
    return create_string_object(b"".join(txt), forced_encoding)
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Create a ByteStringObject or a TextStringObject representing ``string``.

    A ``str`` is wrapped in a TextStringObject directly. For ``bytes`` the
    result depends on ``forced_encoding``:

    * list or dict: every byte is mapped through it; bytes whose lookup
      fails are decoded with ``charmap`` instead. Returns a TextStringObject.
    * ``"bytes"``: the data is returned as a ByteStringObject.
    * any other str: used as the codec name for decoding.
    * otherwise: data starting with the UTF-16 big-endian BOM is decoded as
      UTF-16, anything else with ``decode_pdfdocencoding``; the matching
      ``autodetect_*`` flag is set on the result. If decoding fails the
      data is returned as a ByteStringObject.

    :param Union[str, bytes] string: A string
    :param forced_encoding: Optional encoding override, see above.
    :raises TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        text = ""
        for byte_value in string:
            try:
                text += forced_encoding[byte_value]
            except Exception:
                text += bytes((byte_value,)).decode("charmap")
        return TextStringObject(text)

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        return TextStringObject(string.decode(forced_encoding))

    # No usable override: the only way to find out whether the bytes are
    # text is to try decoding them. Some strings are text, some are just
    # byte arrays.
    try:
        if string.startswith(codecs.BOM_UTF16_BE):
            result = TextStringObject(string.decode("utf-16"))
            result.autodetect_utf16 = True
        else:
            result = TextStringObject(decode_pdfdocencoding(string))
            result.autodetect_pdfdocencoding = True
        return result
    except UnicodeDecodeError:
        return ByteStringObject(string)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes to text through the ``_pdfdoc_encoding`` table.

    Every byte value is used as an index into ``_pdfdoc_encoding``. A table
    entry equal to ``"\\u0000"`` is treated as "no character for this byte"
    and aborts decoding.

    :param bytes byte_array: The data to decode.
    :return: The decoded text.
    :raises UnicodeDecodeError: If a byte maps to ``"\\u0000"``. The
        exception's ``object`` is the single offending byte and
        ``start``/``end`` are 0 and 1.
    """
    chars = []
    for b in byte_array:
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            # bytes((b,)) is the offending byte itself; bytearray(b) with an
            # int would instead create a zero-filled buffer of length b.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes((b,)),
                0,
                1,
                "does not exist in translation table",
            )
        chars.append(c)
    return "".join(chars)