Update matching logic: AI scores all candidates, lower threshold, absolute amount, prompt improvements

This commit is contained in:
Iyeoluwa Akinrinola
2025-07-02 16:38:01 +01:00
commit a519c42866
10641 changed files with 3944174 additions and 0 deletions
@@ -0,0 +1,63 @@
from typing import Dict, List
from .adobe_glyphs import adobe_glyphs
from .pdfdoc import _pdfdoc_encoding
from .std import _std_encoding
from .symbol import _symbol_encoding
from .zapfding import _zapfding_encoding
def fill_from_encoding(enc: str) -> List[str]:
    """
    Build a 256-entry byte-to-character table from a Python codec.

    Args:
        enc: Name of the codec used to decode each single byte.

    Returns:
        A list whose element ``n`` is the text obtained by decoding the byte
        ``n`` with ``enc``. When decoding raises for any reason, the element
        is ``chr(n)`` instead.
    """
    table: List[str] = []
    for code in range(256):
        raw = bytes((code,))
        try:
            glyph = raw.decode(enc)
        except Exception:
            # Best effort: fall back to the code point with the same value.
            glyph = chr(code)
        table.append(glyph)
    return table
def rev_encoding(enc: List[str]) -> Dict[str, int]:
    """
    Invert a byte-to-character table into a character-to-byte mapping.

    Args:
        enc: Table indexed by byte value; the first 256 entries are read.

    Returns:
        A dict mapping each character to the index it occupies in ``enc``.
        Entries equal to ``"\\u0000"`` are treated as placeholders and left
        out of the result.

    Raises:
        AssertionError: If the same character appears at two indices.
        IndexError: If ``enc`` has fewer than 256 entries.
    """
    lookup: Dict[str, int] = {}
    for code in range(256):
        glyph = enc[code]
        if glyph != "\u0000":
            # A character may only be claimed by a single byte value.
            assert glyph not in lookup, (
                f"{glyph} at {code} already at {lookup[glyph]}"
            )
            lookup[glyph] = code
    return lookup
# Forward (byte value -> character) tables generated from Python's own codecs.
_mac_encoding = fill_from_encoding("mac_roman")
_win_encoding = fill_from_encoding("cp1252")

# Reverse (character -> byte value) lookups, one per forward table.
_pdfdoc_encoding_rev: Dict[str, int] = rev_encoding(_pdfdoc_encoding)
_zapfding_encoding_rev: Dict[str, int] = rev_encoding(_zapfding_encoding)
_symbol_encoding_rev: Dict[str, int] = rev_encoding(_symbol_encoding)
_mac_encoding_rev: Dict[str, int] = rev_encoding(_mac_encoding)
_win_encoding_rev: Dict[str, int] = rev_encoding(_win_encoding)
# Forward tables keyed by the name used to select them.
# "/StandardCoding" is kept so existing lookups keep working; "/StandardEncoding"
# is the spelling the PDF specification uses and resolves to the same table.
charset_encoding: Dict[str, List[str]] = {
    "/StandardCoding": _std_encoding,
    "/StandardEncoding": _std_encoding,
    "/WinAnsiEncoding": _win_encoding,
    "/MacRomanEncoding": _mac_encoding,
    "/PDFDocEncoding": _pdfdoc_encoding,
    "/Symbol": _symbol_encoding,
    "/ZapfDingbats": _zapfding_encoding,
}
# Names re-exported by ``from <package> import *``.
__all__ = [
    "adobe_glyphs",
    "_std_encoding", "_symbol_encoding", "_zapfding_encoding",
    "_pdfdoc_encoding", "_pdfdoc_encoding_rev",
    "_win_encoding", "_mac_encoding",
    "charset_encoding",
]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,264 @@
# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
# (see also "C.1 Predefined encodings sorted by character name" of another
# PDF reference).
#
# The list index is the byte value, the entry is the character it decodes to.
# "\u0000" is also used as a filler at indices 22, 127, 159 and 173.
# Index 22 should really be "\u0017" (the value index 23 already carries).
_pdfdoc_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",  # 0 - 7
    "\u0008", "\u0009", "\u000a", "\u000b", "\u000c", "\u000d", "\u000e", "\u000f",  # 8 - 15
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0000", "\u0017",  # 16 - 23
    "\u02d8", "\u02c7", "\u02c6", "\u02d9", "\u02dd", "\u02db", "\u02da", "\u02dc",  # 24 - 31
    "\u0020", "\u0021", "\u0022", "\u0023", "\u0024", "\u0025", "\u0026", "\u0027",  # 32 - 39
    "\u0028", "\u0029", "\u002a", "\u002b", "\u002c", "\u002d", "\u002e", "\u002f",  # 40 - 47
    "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037",  # 48 - 55
    "\u0038", "\u0039", "\u003a", "\u003b", "\u003c", "\u003d", "\u003e", "\u003f",  # 56 - 63
    "\u0040", "\u0041", "\u0042", "\u0043", "\u0044", "\u0045", "\u0046", "\u0047",  # 64 - 71
    "\u0048", "\u0049", "\u004a", "\u004b", "\u004c", "\u004d", "\u004e", "\u004f",  # 72 - 79
    "\u0050", "\u0051", "\u0052", "\u0053", "\u0054", "\u0055", "\u0056", "\u0057",  # 80 - 87
    "\u0058", "\u0059", "\u005a", "\u005b", "\u005c", "\u005d", "\u005e", "\u005f",  # 88 - 95
    "\u0060", "\u0061", "\u0062", "\u0063", "\u0064", "\u0065", "\u0066", "\u0067",  # 96 - 103
    "\u0068", "\u0069", "\u006a", "\u006b", "\u006c", "\u006d", "\u006e", "\u006f",  # 104 - 111
    "\u0070", "\u0071", "\u0072", "\u0073", "\u0074", "\u0075", "\u0076", "\u0077",  # 112 - 119
    "\u0078", "\u0079", "\u007a", "\u007b", "\u007c", "\u007d", "\u007e", "\u0000",  # 120 - 127
    "\u2022", "\u2020", "\u2021", "\u2026", "\u2014", "\u2013", "\u0192", "\u2044",  # 128 - 135
    "\u2039", "\u203a", "\u2212", "\u2030", "\u201e", "\u201c", "\u201d", "\u2018",  # 136 - 143
    "\u2019", "\u201a", "\u2122", "\ufb01", "\ufb02", "\u0141", "\u0152", "\u0160",  # 144 - 151
    "\u0178", "\u017d", "\u0131", "\u0142", "\u0153", "\u0161", "\u017e", "\u0000",  # 152 - 159
    "\u20ac", "\u00a1", "\u00a2", "\u00a3", "\u00a4", "\u00a5", "\u00a6", "\u00a7",  # 160 - 167
    "\u00a8", "\u00a9", "\u00aa", "\u00ab", "\u00ac", "\u0000", "\u00ae", "\u00af",  # 168 - 175
    "\u00b0", "\u00b1", "\u00b2", "\u00b3", "\u00b4", "\u00b5", "\u00b6", "\u00b7",  # 176 - 183
    "\u00b8", "\u00b9", "\u00ba", "\u00bb", "\u00bc", "\u00bd", "\u00be", "\u00bf",  # 184 - 191
    "\u00c0", "\u00c1", "\u00c2", "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7",  # 192 - 199
    "\u00c8", "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce", "\u00cf",  # 200 - 207
    "\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d7",  # 208 - 215
    "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df",  # 216 - 223
    "\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6", "\u00e7",  # 224 - 231
    "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec", "\u00ed", "\u00ee", "\u00ef",  # 232 - 239
    "\u00f0", "\u00f1", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f7",  # 240 - 247
    "\u00f8", "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe", "\u00ff",  # 248 - 255
]

# One entry per possible byte value.
assert len(_pdfdoc_encoding) == 256
@@ -0,0 +1,258 @@
# Byte value -> character table for Adobe StandardEncoding.
#
# The list index is the byte value, the entry is the character it decodes to.
# Every non-ASCII entry is written as an escape sequence so the table cannot be
# damaged by tools that drop or re-encode literal non-ASCII characters; that is
# how the typographic entries (curly quotes, dashes, daggers, ligatures, ...)
# had degraded into empty strings, which made those byte values decode to
# nothing. They are restored here from the StandardEncoding character set.
_std_encoding = [
    "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",  # 0 - 7
    "\x08", "\t", "\n", "\x0b", "\x0c", "\r", "\x0e", "\x0f",  # 8 - 15
    "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17",  # 16 - 23
    "\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",  # 24 - 31
    " ", "!", '"', "#", "$", "%", "&", "\u2019",  # 32 - 39 (39: quoteright)
    "(", ")", "*", "+", ",", "-", ".", "/",  # 40 - 47
    "0", "1", "2", "3", "4", "5", "6", "7",  # 48 - 55
    "8", "9", ":", ";", "<", "=", ">", "?",  # 56 - 63
    "@", "A", "B", "C", "D", "E", "F", "G",  # 64 - 71
    "H", "I", "J", "K", "L", "M", "N", "O",  # 72 - 79
    "P", "Q", "R", "S", "T", "U", "V", "W",  # 80 - 87
    "X", "Y", "Z", "[", "\\", "]", "^", "_",  # 88 - 95
    "\u2018", "a", "b", "c", "d", "e", "f", "g",  # 96 - 103 (96: quoteleft)
    "h", "i", "j", "k", "l", "m", "n", "o",  # 104 - 111
    "p", "q", "r", "s", "t", "u", "v", "w",  # 112 - 119
    "x", "y", "z", "{", "|", "}", "~", "\x7f",  # 120 - 127
    "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87",  # 128 - 135
    "\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f",  # 136 - 143
    "\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97",  # 144 - 151
    "\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f",  # 152 - 159
    "\xa0", "\u00a1", "\u00a2", "\u00a3", "\u2044", "\u00a5", "\u0192", "\u00a7",  # 160 - 167
    "\u00a4", "'", "\u201c", "\u00ab", "\u2039", "\u203a", "\ufb01", "\ufb02",  # 168 - 175
    "\u00b0", "\u2013", "\u2020", "\u2021", "\u00b7", "\u00b5", "\u00b6", "\u2022",  # 176 - 183
    "\u201a", "\u201e", "\u201d", "\u00bb", "\u2026", "\u2030", "\u00be", "\u00bf",  # 184 - 191
    "\u00c0", "`", "\u00b4", "\u02c6", "\u02dc", "\u00af", "\u02d8", "\u02d9",  # 192 - 199
    "\u00a8", "\u00c9", "\u02da", "\u00b8", "\u00cc", "\u02dd", "\u02db", "\u02c7",  # 200 - 207
    "\u2014", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d7",  # 208 - 215
    "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df",  # 216 - 223
    "\u00e0", "\u00c6", "\u00e2", "\u00aa", "\u00e4", "\u00e5", "\u00e6", "\u00e7",  # 224 - 231
    "\u0141", "\u00d8", "\u0152", "\u00ba", "\u00ec", "\u00ed", "\u00ee", "\u00ef",  # 232 - 239
    "\u00f0", "\u00e6", "\u00f2", "\u00f3", "\u00f4", "\u0131", "\u00f6", "\u00f7",  # 240 - 247
    "\u0142", "\u00f8", "\u0153", "\u00df", "\u00fc", "\u00fd", "\u00fe", "\u00ff",  # 248 - 255
]

# One entry per possible byte value, like the sibling encoding tables.
assert len(_std_encoding) == 256
@@ -0,0 +1,260 @@
# Byte value -> character table for the Symbol font.
# Hand-transcribed from
# https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
# The list index is the byte value, the entry is the character it decodes to.
_symbol_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",  # 0 - 7
    "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F",  # 8 - 15
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017",  # 16 - 23
    "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F",  # 24 - 31
    "\u0020", "\u0021", "\u2200", "\u0023", "\u2203", "\u0025", "\u0026", "\u220B",  # 32 - 39
    "\u0028", "\u0029", "\u2217", "\u002B", "\u002C", "\u2212", "\u002E", "\u002F",  # 40 - 47
    "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037",  # 48 - 55
    "\u0038", "\u0039", "\u003A", "\u003B", "\u003C", "\u003D", "\u003E", "\u003F",  # 56 - 63
    "\u2245", "\u0391", "\u0392", "\u03A7", "\u0394", "\u0395", "\u03A6", "\u0393",  # 64 - 71
    "\u0397", "\u0399", "\u03D1", "\u039A", "\u039B", "\u039C", "\u039D", "\u039F",  # 72 - 79
    "\u03A0", "\u0398", "\u03A1", "\u03A3", "\u03A4", "\u03A5", "\u03C2", "\u03A9",  # 80 - 87
    "\u039E", "\u03A8", "\u0396", "\u005B", "\u2234", "\u005D", "\u22A5", "\u005F",  # 88 - 95
    "\uF8E5", "\u03B1", "\u03B2", "\u03C7", "\u03B4", "\u03B5", "\u03C6", "\u03B3",  # 96 - 103
    "\u03B7", "\u03B9", "\u03D5", "\u03BA", "\u03BB", "\u00B5", "\u03BD", "\u03BF",  # 104 - 111
    "\u03C0", "\u03B8", "\u03C1", "\u03C3", "\u03C4", "\u03C5", "\u03D6", "\u03C9",  # 112 - 119
    "\u03BE", "\u03C8", "\u03B6", "\u007B", "\u007C", "\u007D", "\u223C", "\u007F",  # 120 - 127
    "\u0080", "\u0081", "\u0082", "\u0083", "\u0084", "\u0085", "\u0086", "\u0087",  # 128 - 135
    "\u0088", "\u0089", "\u008A", "\u008B", "\u008C", "\u008D", "\u008E", "\u008F",  # 136 - 143
    "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", "\u0096", "\u0097",  # 144 - 151
    "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", "\u009D", "\u009E", "\u009F",  # 152 - 159
    "\u20AC", "\u03D2", "\u2032", "\u2264", "\u2044", "\u221E", "\u0192", "\u2663",  # 160 - 167
    "\u2666", "\u2665", "\u2660", "\u2194", "\u2190", "\u2191", "\u2192", "\u2193",  # 168 - 175
    "\u00B0", "\u00B1", "\u2033", "\u2265", "\u00D7", "\u221D", "\u2202", "\u2022",  # 176 - 183
    "\u00F7", "\u2260", "\u2261", "\u2248", "\u2026", "\uF8E6", "\uF8E7", "\u21B5",  # 184 - 191
    "\u2135", "\u2111", "\u211C", "\u2118", "\u2297", "\u2295", "\u2205", "\u2229",  # 192 - 199
    "\u222A", "\u2283", "\u2287", "\u2284", "\u2282", "\u2286", "\u2208", "\u2209",  # 200 - 207
    "\u2220", "\u2207", "\uF6DA", "\uF6D9", "\uF6DB", "\u220F", "\u221A", "\u22C5",  # 208 - 215
    "\u00AC", "\u2227", "\u2228", "\u21D4", "\u21D0", "\u21D1", "\u21D2", "\u21D3",  # 216 - 223
    "\u25CA", "\u2329", "\uF8E8", "\uF8E9", "\uF8EA", "\u2211", "\uF8EB", "\uF8EC",  # 224 - 231
    "\uF8ED", "\uF8EE", "\uF8EF", "\uF8F0", "\uF8F1", "\uF8F2", "\uF8F3", "\uF8F4",  # 232 - 239
    "\u00F0", "\u232A", "\u222B", "\u2320", "\uF8F5", "\u2321", "\uF8F6", "\uF8F7",  # 240 - 247
    "\uF8F8", "\uF8F9", "\uF8FA", "\uF8FB", "\uF8FC", "\uF8FD", "\uF8FE", "\u00FF",  # 248 - 255
]

# One entry per possible byte value.
assert len(_symbol_encoding) == 256
@@ -0,0 +1,261 @@
# Byte value -> character table for the ZapfDingbats font.
# Hand-transcribed from
# https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
# The list index is the byte value, the entry is the character it decodes to.
_zapfding_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",  # 0 - 7
    "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F",  # 8 - 15
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017",  # 16 - 23
    "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F",  # 24 - 31
    "\u0020", "\u2701", "\u2702", "\u2703", "\u2704", "\u260E", "\u2706", "\u2707",  # 32 - 39
    "\u2708", "\u2709", "\u261B", "\u261E", "\u270C", "\u270D", "\u270E", "\u270F",  # 40 - 47
    "\u2710", "\u2711", "\u2712", "\u2713", "\u2714", "\u2715", "\u2716", "\u2717",  # 48 - 55
    "\u2718", "\u2719", "\u271A", "\u271B", "\u271C", "\u271D", "\u271E", "\u271F",  # 56 - 63
    "\u2720", "\u2721", "\u2722", "\u2723", "\u2724", "\u2725", "\u2726", "\u2727",  # 64 - 71
    "\u2605", "\u2729", "\u272A", "\u272B", "\u272C", "\u272D", "\u272E", "\u272F",  # 72 - 79
    "\u2730", "\u2731", "\u2732", "\u2733", "\u2734", "\u2735", "\u2736", "\u2737",  # 80 - 87
    "\u2738", "\u2739", "\u273A", "\u273B", "\u273C", "\u273D", "\u273E", "\u273F",  # 88 - 95
    "\u2740", "\u2741", "\u2742", "\u2743", "\u2744", "\u2745", "\u2746", "\u2747",  # 96 - 103
    "\u2748", "\u2749", "\u274A", "\u274B", "\u25CF", "\u274D", "\u25A0", "\u274F",  # 104 - 111
    "\u2750", "\u2751", "\u2752", "\u25B2", "\u25BC", "\u25C6", "\u2756", "\u25D7",  # 112 - 119
    "\u2758", "\u2759", "\u275A", "\u275B", "\u275C", "\u275D", "\u275E", "\u007F",  # 120 - 127
    "\uF8D7", "\uF8D8", "\uF8D9", "\uF8DA", "\uF8DB", "\uF8DC", "\uF8DD", "\uF8DE",  # 128 - 135
    "\uF8DF", "\uF8E0", "\uF8E1", "\uF8E2", "\uF8E3", "\uF8E4", "\u008E", "\u008F",  # 136 - 143
    "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", "\u0096", "\u0097",  # 144 - 151
    "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", "\u009D", "\u009E", "\u009F",  # 152 - 159
    "\u00A0", "\u2761", "\u2762", "\u2763", "\u2764", "\u2765", "\u2766", "\u2767",  # 160 - 167
    "\u2663", "\u2666", "\u2665", "\u2660", "\u2460", "\u2461", "\u2462", "\u2463",  # 168 - 175
    "\u2464", "\u2465", "\u2466", "\u2467", "\u2468", "\u2469", "\u2776", "\u2777",  # 176 - 183
    "\u2778", "\u2779", "\u277A", "\u277B", "\u277C", "\u277D", "\u277E", "\u277F",  # 184 - 191
    "\u2780", "\u2781", "\u2782", "\u2783", "\u2784", "\u2785", "\u2786", "\u2787",  # 192 - 199
    "\u2788", "\u2789", "\u278A", "\u278B", "\u278C", "\u278D", "\u278E", "\u278F",  # 200 - 207
    "\u2790", "\u2791", "\u2792", "\u2793", "\u2794", "\u2192", "\u2194", "\u2195",  # 208 - 215
    "\u2798", "\u2799", "\u279A", "\u279B", "\u279C", "\u279D", "\u279E", "\u279F",  # 216 - 223
    "\u27A0", "\u27A1", "\u27A2", "\u27A3", "\u27A4", "\u27A5", "\u27A6", "\u27A7",  # 224 - 231
    "\u27A8", "\u27A9", "\u27AA", "\u27AB", "\u27AC", "\u27AD", "\u27AE", "\u27AF",  # 232 - 239
    "\u00F0", "\u27B1", "\u27B2", "\u27B3", "\u27B4", "\u27B5", "\u27B6", "\u27B7",  # 240 - 247
    "\u27B8", "\u27B9", "\u27BA", "\u27BB", "\u27BC", "\u27BD", "\u27BE", "\u00FF",  # 248 - 255
]

# One entry per possible byte value.
assert len(_zapfding_encoding) == 256