StrictDoc Documentation
strictdoc/features/html2pdf/pdf_postprocessor.py
Source file coverage
Path:
strictdoc/features/html2pdf/pdf_postprocessor.py
Lines:
159
Non-empty lines:
138
Non-empty lines covered with requirements:
138 / 138 (100.0%)
Functions:
6
Functions covered by requirements:
6 / 6 (100.0%)
1
"""
2
@relation(SDOC-SRS-51, scope=file)
3
"""
4
 
5
import ntpath
6
import os
7
from pathlib import Path
8
from tempfile import NamedTemporaryFile
9
from typing import Dict, List, Optional, Tuple
10
from urllib.parse import unquote, urlsplit
11
 
12
from pypdf import PdfWriter
13
from pypdf.generic import (
14
    DictionaryObject,
15
    NameObject,
16
    PdfObject,
17
    TextStringObject,
18
)
19
 
20
 
21
class PDFPostprocessor:
    """
    Rewrites links between printed PDF documents.

    The PDF files are printed from HTML pages, so a link from one document
    to another is stored in the PDF as a URI action that points to the
    "file://" URL of the other HTML page. This class replaces such actions
    with remote go-to (GoToR) actions that point to the PDF printed from
    that HTML page.
    """

    @classmethod
    def rewrite_cross_document_links(
        cls,
        *,
        path_to_input_root: str,  # noqa: ARG003
        paths_to_print: List[Tuple[str, str]],
    ) -> None:
        """
        Rewrite the cross-document links of every printed PDF in place.

        path_to_input_root: Not used by this method.
        paths_to_print: Pairs of (path to HTML page, path to the PDF printed
                        from that page).
        """

        # The keys are the percent-decoded path components of the file://
        # URLs of the HTML pages. Path.as_uri() percent-encodes every
        # character outside of a small safe set, and the program that wrote
        # the URI into the PDF may have encoded the same path differently,
        # so both sides are compared in the decoded form.
        html_to_pdf_map: Dict[str, str] = {
            cls._normalize_uri_path(
                urlsplit(Path(path_to_html).resolve().as_uri()).path
            ): os.path.abspath(path_to_pdf)
            for path_to_html, path_to_pdf in paths_to_print
        }
        for _, path_to_pdf in paths_to_print:
            cls._rewrite_cross_document_links_in_single_document(
                html_to_pdf_map=html_to_pdf_map,
                path_to_pdf=path_to_pdf,
            )

    @classmethod
    def _rewrite_cross_document_links_in_single_document(
        cls,
        *,
        html_to_pdf_map: Dict[str, str],
        path_to_pdf: str,
    ) -> None:
        """
        Rewrite the cross-document links of a single PDF file in place.

        Every /Link annotation with a /URI action that resolves to a key of
        html_to_pdf_map gets its action replaced. The file on disk is only
        touched if at least one action was replaced.
        """

        path_to_pdf = os.path.abspath(path_to_pdf)
        path_to_pdf_dir = os.path.dirname(path_to_pdf)

        writer = PdfWriter(clone_from=path_to_pdf)
        modified = False

        for page in writer.pages:
            # dict.get() does not resolve indirect references, so every
            # value is resolved explicitly before its type is checked.
            annotations = cls._resolve_indirect(page.get("/Annots"))
            if not isinstance(annotations, list):
                continue

            for annotation_reference in annotations:
                annotation = annotation_reference.get_object()
                if not isinstance(annotation, DictionaryObject):
                    continue
                if annotation.get("/Subtype") != "/Link":
                    continue

                action = cls._resolve_indirect(annotation.get("/A"))
                if not isinstance(action, DictionaryObject):
                    continue
                if action.get("/S") != "/URI":
                    continue

                uri = cls._resolve_indirect(action.get("/URI"))
                if not isinstance(uri, str):
                    continue

                rewritten_action = cls._create_pdf_gotor_action(
                    uri=uri,
                    html_to_pdf_map=html_to_pdf_map,
                    path_to_pdf_dir=path_to_pdf_dir,
                )
                if rewritten_action is None:
                    continue

                annotation[NameObject("/A")] = rewritten_action
                modified = True

        if not modified:
            return

        # The result is written to a temporary file in the same directory
        # and then moved over the original, so that a failed write never
        # leaves a truncated PDF behind.
        temp_file = NamedTemporaryFile(
            mode="wb",
            suffix=".pdf",
            dir=path_to_pdf_dir,
            delete=False,
        )
        try:
            with temp_file:
                writer.write(temp_file)
            try:
                # Temporary files are created readable by the owner only.
                # Carry over the permission bits of the original PDF so
                # that the replacement does not silently change them.
                os.chmod(
                    temp_file.name, os.stat(path_to_pdf).st_mode & 0o7777
                )
            except OSError:
                # Best effort: not every file system supports chmod, and a
                # PDF with stricter permissions is still a valid result.
                pass
            os.replace(temp_file.name, path_to_pdf)
        finally:
            # After a successful os.replace() the temporary path no longer
            # exists. If it still does, something failed on the way and the
            # leftover file must not stay in the output directory.
            if os.path.exists(temp_file.name):
                os.remove(temp_file.name)

    @staticmethod
    def _resolve_indirect(pdf_object: Optional[PdfObject]) -> Optional[PdfObject]:
        """
        Return the object behind an indirect reference.

        Direct objects are returned as they are, None stays None.
        """

        if pdf_object is None:
            return None
        return pdf_object.get_object()

    @staticmethod
    def _normalize_uri_path(uri_path: str) -> str:
        """
        Return the percent-decoded form of a URI path component.

        Two URIs that point to the same file can differ in which characters
        are percent-encoded and in the case of the hex digits. The decoded
        form is the same for both.
        """

        return unquote(uri_path)

    @staticmethod
    def _create_pdf_gotor_action(
        *,
        uri: str,
        html_to_pdf_map: Dict[str, str],
        path_to_pdf_dir: str,
    ) -> Optional[DictionaryObject]:
        """
        Create a GoToR action for a URI that points to a printed HTML page.

        uri: The URI of the original link action.
        html_to_pdf_map: Maps the URI path of an HTML page to the absolute
                         path of the PDF printed from it.
        path_to_pdf_dir: Directory of the PDF that contains the link. The
                         target path is stored relative to this directory.

        Returns None if the URI is not a file:// URI or if its path is not
        found in html_to_pdf_map. The URI fragment, if there is one, becomes
        the destination name of the action.
        """

        # ruff: noqa: ERA001
        # urlsplit() produces an object of the following kind:
        # SplitResult(
        #     scheme='file',
        #     netloc='',
        #     path='<path-to-project>/output/html2pdf/html/<project-mount-folder>/<path-to-doc>-PDF.html',
        #     query='',
        #     fragment='ANCHOR'
        # )
        parsed_uri = urlsplit(uri)
        if parsed_uri.scheme != "file":
            return None

        # The exact path is tried first for maps with percent-encoded keys.
        # The decoded path covers the case where the URI in the PDF and the
        # key of the map encode the same path in different ways.
        matching_pdf_abspath = html_to_pdf_map.get(parsed_uri.path)
        if matching_pdf_abspath is None:
            matching_pdf_abspath = html_to_pdf_map.get(
                PDFPostprocessor._normalize_uri_path(parsed_uri.path)
            )
        if matching_pdf_abspath is None:
            return None

        matching_pdf_relpath = PDFPostprocessor._create_relative_pdf_path(
            path_to_pdf=matching_pdf_abspath,
            start_dir=path_to_pdf_dir,
        )

        action = DictionaryObject()
        action[NameObject("/Type")] = NameObject("/Action")
        action[NameObject("/S")] = NameObject("/GoToR")
        action[NameObject("/F")] = TextStringObject(matching_pdf_relpath)

        # Without a fragment no /D entry is written.
        destination_name = unquote(parsed_uri.fragment)
        if destination_name:
            action[NameObject("/D")] = (
                PDFPostprocessor._create_destination_object(destination_name)
            )
        return action

    @staticmethod
    def _create_destination_object(destination_name: str) -> PdfObject:
        """
        Create the PDF name object for a non-empty destination name.

        A leading "/" is added unless the name already starts with one.
        """

        assert len(destination_name) > 0
        if destination_name.startswith("/"):
            return NameObject(destination_name)
        return NameObject(f"/{destination_name}")

    @staticmethod
    def _create_relative_pdf_path(*, path_to_pdf: str, start_dir: str) -> str:
        """
        Return path_to_pdf relative to start_dir, with "/" as the separator.

        If either path carries a Windows drive or UNC prefix, the Windows
        path rules are applied regardless of the current platform. Otherwise
        the rules of the current platform are used.
        """

        path_module = (
            ntpath
            if ntpath.splitdrive(path_to_pdf)[0]
            or ntpath.splitdrive(start_dir)[0]
            else os.path
        )
        relative_path = path_module.relpath(path_to_pdf, start=start_dir)
        # assert is needed to satisfy the type checker.
        assert isinstance(relative_path, str), relative_path
        return relative_path.replace("\\", "/")