Update matching logic: AI scores all candidates, lower threshold, absolute amount, prompt improvements
This commit is contained in:
@@ -0,0 +1,525 @@
|
||||
"""
|
||||
Anything related to XMP metadata.
|
||||
|
||||
See https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import decimal
|
||||
import re
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from xml.dom.minidom import Document
|
||||
from xml.dom.minidom import Element as XmlElement
|
||||
from xml.dom.minidom import parseString
|
||||
from xml.parsers.expat import ExpatError
|
||||
|
||||
from ._utils import (
|
||||
StreamType,
|
||||
deprecate_with_replacement,
|
||||
deprecation_with_replacement,
|
||||
)
|
||||
from .errors import PdfReadError
|
||||
from .generic import ContentStream, PdfObject
|
||||
|
||||
# Namespace URIs used to look up XMP elements and attributes in this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask? It is a completely undocumented
# namespace used to place "custom metadata" properties, which are arbitrary
# metadata properties with no semantic or documented meaning. Elements in the
# namespace are key/value-style storage, where the element name is the key
# and the content is the value. The keys are transformed into valid XML
# identifiers by substituting an invalid identifier character with \u2182
# followed by the unicode hex ID of the original character. A key like
# "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional use of the pdfx namespace is strongly discouraged. A custom
# data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
|
||||
|
||||
# Pattern for the date strings handled by ``_converter_date``:
#   YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]]
# where TZD is either "Z" or a signed "hh:mm" offset. The pattern is only
# anchored at the start (it is used with ``match``), so trailing text is
# ignored. The fraction separator is an escaped, literal "." so that
# arbitrary characters are not accepted inside the seconds field.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)
|
||||
|
||||
|
||||
# Generic type variable so ``_identity`` is typed as returning its input type.
K = TypeVar("K")


def _identity(value: K) -> K:
    """
    Return ``value`` unchanged.

    Used as the default ``converter`` of the property getter factories in
    this module when no conversion is wanted.
    """
    return value
|
||||
|
||||
|
||||
def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an ISO 8601 style date string into a naive UTC ``datetime``.

    Missing month and day default to 1, missing time fields default to 0,
    and a missing time zone designator is treated as "Z" (UTC). A signed
    ``hh:mm`` designator is removed from the result so that the returned
    value is expressed in UTC.

    :param value: the date string, matched against the ``iso8601`` pattern.
    :return: a ``datetime.datetime`` without tzinfo, normalised to UTC.
    :raises ValueError: if ``value`` does not match the ``iso8601`` pattern.
    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")

    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Decimal keeps the fractional part exact while it is split into whole
    # seconds and microseconds.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1000000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Read the sign from the designator itself: int("+00") and int("-00")
        # are both 0, so the sign cannot be recovered from the parsed hours.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(x) for x in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local time = UTC + offset, hence UTC = local time - offset
        dt = dt - sign * offset
    return dt
|
||||
|
||||
|
||||
def _getter_bag(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[List[str]]]:
    """
    Build a property getter for an unordered array (``rdf:Bag``) value.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :return: a function that takes an ``XmpInformation`` and returns the text
        of every ``rdf:li`` found inside the ``rdf:Bag`` children of the
        matching elements (an empty list if there are none). The result is
        stored in ``self.cache[namespace][name]``.
    """

    def get(self: "XmpInformation") -> Optional[List[str]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Membership test instead of truthiness, so an empty result is
        # served from the cache too rather than re-walking the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: List[str] = []
        for element in self.get_element("", namespace, name):
            for bag in element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag"):
                for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                    retval.append(self._get_text(item))
        ns_cache[name] = retval
        return retval

    return get
|
||||
|
||||
|
||||
def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[List[Any]]]:
    """
    Build a property getter for an ordered array (``rdf:Seq``) value.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :param converter: applied to the text of every item before it is stored.
    :return: a function that takes an ``XmpInformation`` and returns the
        converted ``rdf:li`` texts of every ``rdf:Seq`` below the matching
        elements. An element without any ``rdf:Seq`` contributes its own
        converted text instead. The result is stored in
        ``self.cache[namespace][name]``.
    """

    def get(self: "XmpInformation") -> Optional[List[Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Membership test instead of truthiness, so an empty result is
        # served from the cache too rather than re-walking the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: List[Any] = []
        for element in self.get_element("", namespace, name):
            seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
            if len(seqs):
                for seq in seqs:
                    for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._get_text(item)))
            else:
                # No rdf:Seq wrapper: use the element's own text as one item.
                retval.append(converter(self._get_text(element)))
        ns_cache[name] = retval
        return retval

    return get
|
||||
|
||||
|
||||
def _getter_langalt(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[Dict[Any, Any]]]:
    """
    Build a property getter for a language alternative (``rdf:Alt``) value.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :return: a function that takes an ``XmpInformation`` and returns a dict
        mapping the ``xml:lang`` attribute of each ``rdf:li`` to its text.
        An element without any ``rdf:Alt`` contributes its own text under
        the key ``"x-default"``. The result is stored in
        ``self.cache[namespace][name]``.
    """

    def get(self: "XmpInformation") -> Optional[Dict[Any, Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Membership test instead of truthiness, so an empty result is
        # served from the cache too rather than re-walking the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: Dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                # No rdf:Alt wrapper: expose the plain text as the default.
                retval["x-default"] = self._get_text(element)
        ns_cache[name] = retval
        return retval

    return get
|
||||
|
||||
|
||||
def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a property getter for a simple (single-valued) property.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :param converter: applied to the raw string when a value was found.
    :return: a function that takes an ``XmpInformation`` and returns the
        converted value of the first matching attribute or element, or
        ``None`` if there is no match. The result is stored in
        ``self.cache[namespace][name]``.
    """

    def get(self: "XmpInformation") -> Optional[Any]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Membership test instead of truthiness, so a missing (None) or
        # empty value is served from the cache too.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        # Only the first match is used.
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break
        if value is not None:
            value = converter(value)
        ns_cache[name] = value
        return value

    return get
|
||||
|
||||
|
||||
class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata.
    Usually accessed by :py:attr:`xmp_metadata()<PyPDF2.PdfReader.xmp_metadata>`

    :raises PdfReadError: if XML is invalid
    """

    def __init__(self, stream: ContentStream) -> None:
        """
        Parse the XMP packet held by ``stream``.

        :param stream: the stream whose data is parsed as XML.
        :raises PdfReadError: if the data is not well-formed XML or contains
            no ``rdf:RDF`` element.
        """
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)
        except ExpatError as e:
            # Chain the parser error so the original cause stays visible.
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Report a missing root as a read error, not a bare IndexError.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        # Parsed property values, keyed by namespace URI and then by name.
        self.cache: Dict[Any, Any] = {}

    @property
    def rdfRoot(self) -> XmlElement:  # pragma: no cover
        """Deprecated alias of :attr:`rdf_root`."""
        deprecate_with_replacement("rdfRoot", "rdf_root", "4.0.0")
        return self.rdf_root

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write the underlying stream object to ``stream``."""
        self.stream.write_to_stream(stream, encryption_key)

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`write_to_stream` instead.
        """
        deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0")
        self.write_to_stream(stream, encryption_key)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that carry the property ``name`` in ``namespace``.

        For every ``rdf:Description`` whose ``rdf:about`` equals
        ``about_uri``, the matching attribute node (if present) is yielded
        first, followed by all matching descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def getElement(
        self, aboutUri: str, namespace: str, name: str
    ) -> Iterator[Any]:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`get_element` instead.
        """
        deprecation_with_replacement("getElement", "get_element", "3.0.0")
        return self.get_element(aboutUri, namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield every attribute and direct child node in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are inspected; attributes are yielded before children.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def getNodesInNamespace(
        self, aboutUri: str, namespace: str
    ) -> Iterator[Any]:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`get_nodes_in_namespace` instead.
        """
        deprecation_with_replacement(
            "getNodesInNamespace", "get_nodes_in_namespace", "3.0.0"
        )
        return self.get_nodes_in_namespace(aboutUri, namespace)

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors). An unsorted
    array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """
    A sorted array of names of the authors of the resource, listed in order
    of precedence.
    """

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.  The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """
    A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource.
    """

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """
    The PDF file version, for example 1.0, 1.3.
    """

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """
    The name of the tool that created the PDF document.
    """

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.  The date and
    time are returned as a UTC datetime.datetime object.
    """

    @property
    def xmp_createDate(self) -> datetime.datetime:  # pragma: no cover
        """Deprecated alias of :attr:`xmp_create_date`."""
        deprecate_with_replacement("xmp_createDate", "xmp_create_date", "4.0.0")
        return self.xmp_create_date

    @xmp_createDate.setter
    def xmp_createDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_createDate", "xmp_create_date", "4.0.0")
        # NOTE(review): xmp_create_date is declared with a getter only, so
        # this assignment raises AttributeError.
        self.xmp_create_date = value

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.  The date and time
    are returned as a UTC datetime.datetime object.
    """

    @property
    def xmp_modifyDate(self) -> datetime.datetime:  # pragma: no cover
        """Deprecated alias of :attr:`xmp_modify_date`."""
        deprecate_with_replacement("xmp_modifyDate", "xmp_modify_date", "4.0.0")
        return self.xmp_modify_date

    @xmp_modifyDate.setter
    def xmp_modifyDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_modifyDate", "xmp_modify_date", "4.0.0")
        # NOTE(review): xmp_modify_date is declared with a getter only, so
        # this assignment raises AttributeError.
        self.xmp_modify_date = value

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    @property
    def xmp_metadataDate(self) -> datetime.datetime:  # pragma: no cover
        """Deprecated alias of :attr:`xmp_metadata_date`."""
        deprecate_with_replacement("xmp_metadataDate", "xmp_metadata_date", "4.0.0")
        return self.xmp_metadata_date

    @xmp_metadataDate.setter
    def xmp_metadataDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_metadataDate", "xmp_metadata_date", "4.0.0")
        # NOTE(review): xmp_metadata_date is declared with a getter only, so
        # this assignment raises AttributeError.
        self.xmp_metadata_date = value

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    @property
    def xmp_creatorTool(self) -> str:  # pragma: no cover
        """Deprecated alias of :attr:`xmp_creator_tool`."""
        deprecation_with_replacement("xmp_creatorTool", "xmp_creator_tool", "3.0.0")
        return self.xmp_creator_tool

    @xmp_creatorTool.setter
    def xmp_creatorTool(self, value: str) -> None:  # pragma: no cover
        deprecation_with_replacement("xmp_creatorTool", "xmp_creator_tool", "3.0.0")
        self.xmp_creator_tool = value

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """
    The common identifier for all versions and renditions of this resource.
    """

    @property
    def xmpmm_documentId(self) -> str:  # pragma: no cover
        """Deprecated alias of :attr:`xmpmm_document_id`."""
        deprecation_with_replacement("xmpmm_documentId", "xmpmm_document_id", "3.0.0")
        return self.xmpmm_document_id

    @xmpmm_documentId.setter
    def xmpmm_documentId(self, value: str) -> None:  # pragma: no cover
        deprecation_with_replacement("xmpmm_documentId", "xmpmm_document_id", "3.0.0")
        self.xmpmm_document_id = value

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """
    An identifier for a specific incarnation of a document, updated each
    time a file is saved.
    """

    @property
    def xmpmm_instanceId(self) -> str:  # pragma: no cover
        """Deprecated alias of :attr:`xmpmm_instance_id`."""
        deprecation_with_replacement("xmpmm_instanceId", "xmpmm_instance_id", "3.0.0")
        return cast(str, self.xmpmm_instance_id)

    @xmpmm_instanceId.setter
    def xmpmm_instanceId(self, value: str) -> None:  # pragma: no cover
        deprecation_with_replacement("xmpmm_instanceId", "xmpmm_instance_id", "3.0.0")
        self.xmpmm_instance_id = value

    @property
    def custom_properties(self) -> Dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        The dictionary is built on first access and stored on the instance.

        :return: a dictionary of key/value items for custom metadata properties.
        """
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    # Replace the marker and the four hex digits after it
                    # with the character they encode.
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties
|
||||
Reference in New Issue
Block a user