
    kh                     ~    d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	l	m
Z
 dd
lmZmZ ddlmZ  G d de      Zy)z+
A wrapper of PyMuPDF Page as page engine.
    N   )RawPage   )ImagesExtractor)Paths)FACTOR_A_HALF)Element)RectType
debug_plot)get_areac                   J    e Zd ZdZd Zd Zd Zd Z ed      d        Z	d Z
y	)
RawPageFitzz6A wrapper of ``fitz.Page`` to extract source contents.c                    i }| j                   s|S | j                   j                  ^ }}}|j                  ||d       ||c| _        | _         | j
                  di |}||d<    | j                  di |}|d   j                  |        | j                  di |\  }}	||d<   |d   j                  |	       | j                         }
|d   j                  |
       t        j                  | j                   j                         |S )N)widthheightblocksshapes )page_enginerectupdater   r   _preprocess_text_preprocess_imagesextend_preprocess_shapes_preprocess_hyperlinksr	   set_rotation_matrixrotation_matrix)selfsettingsraw_dict_whtext_blocksimage_blocksr   images
hyperlinkss              U/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/page/RawPageFitz.pyextract_raw_dictzRawPageFitz.extract_raw_dict   s    ##((AqA45"#Q
DK ,d++7h7(.t..::!!,/1$11=H=#!!&)002
!!*- 	##D$4$4$D$DE    c           	      R   |d   }|dk(  rt        d      | j                  j                  dd      }|j                  dg       }	 | j                  j	                         }|s|S |d	k(  rd
 }nd }t        t        ||            }d }g }	|D ]r  }
d}|
d   D ]R  }|d   D ]D  }|D ]9  }t        |d   |d         }| ||d         z  t        k\  s+|d   |d   k(  s7d} n |sD n |sR n |rb|	j                  |
       t |	S # t
        $ r t        j                  d       g }Y w xY w)ak  Extract page text and identify hidden text. 
        
        NOTE: All the coordinates are relative to un-rotated page.

            https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
            https://pymupdf.readthedocs.io/en/latest/functions.html#Page.get_texttrace
            https://pymupdf.readthedocs.io/en/latest/textpage.html
        ocrr   z/OCR feature is planned but not implemented yet.rawdict@   )flagsr   zJIgnore hidden text checking due to UnicodeDecodeError in upstream library.r   c                     | d   dk7  S Ntype   r   spans    r)   <lambda>z.RawPageFitz._preprocess_text.<locals>.<lambda>N       T&\1_ r+   c                     | d   dk(  S r2   r   r5   s    r)   r7   z.RawPageFitz._preprocess_text.<locals>.<lambda>P   r8   r+   c                 &    | \  }}}}||z
  ||z
  z  S )Nr   )bboxx0y0x1y1s        r)   	span_areaz/RawPageFitz._preprocess_text.<locals>.span_areaS   s"    !NBBrEbe$$r+   Flinesspansr;   fontT)
SystemExitr   get_textgetget_texttraceSystemErrorloggingwarninglistfilterr   r   append)r   r    r-   rawr%   rB   ffiltered_spansr@   r   blockintersectedliner6   filter_spanintersected_areas                   r)   r   zRawPageFitz._preprocess_text1   sv    uo6$UVV ''	'<ggh+
	$$224E
 [( 6,A,AfQ./	%   	5EKg 	& M *D'5 "+3DL+fBU+V(+iV.EEV $Vk&.A A*.K!" #E* 	& e 4	5  I  	OOhiE	s   D  D&%D&c                 d    |d   dk(  rg S t        | j                        j                  |d         S )aD  Extract image blocks. Image block extracted by ``page.get_text('rawdict')`` doesn't 
        contain alpha channel data, so it has to get page images by ``page.get_images()`` and 
        then recover them. Note that ``Page.get_images()`` contains each image only once, i.e., 
        ignore duplicated occurrences.
        r-   r   clip_image_res_ratio)r   r   extract_images)r   r    s     r)   r   zRawPageFitz._preprocess_imagesm   s6     E?Abyt//0??I_@`aar+   c                 n     | j                   di |}|j                  |d   |d   |d   |d   |d         S )zGIdentify iso-oriented paths and convert vector graphic paths to pixmap.min_svg_gap_dxmin_svg_gap_dy	min_svg_w	min_svg_hrW   r   )_init_pathsto_shapes_and_images)r   r    pathss      r)   r   zRawPageFitz._preprocess_shapesy   sU       ,8,))%&%&[![!+,. 	.r+   zSource Pathsc                 l    | j                   j                         }t        |       j                  |      S )z:Initialize Paths based on drawings extracted with PyMuPDF.)parent)r   get_cdrawingsr   restore)r   r    	raw_pathss      r)   r^   zRawPageFitz._init_paths   s.     $$224	D!)))44r+   c                     g }| j                   j                         D ]G  }|d   dk7  r|j                  t        j                  j
                  t        |d         |d   d       I |S )ziGet source hyperlink dicts.

        Returns:
            list: A list of source hyperlink dict.
        kindr   fromuri)r3   r;   ri   )r   	get_linksrM   r
   	HYPERLINKvaluetuple)r   r(   links      r)   r   z"RawPageFitz._preprocess_hyperlinks   sq     
$$..0 	DF|Q **00d6l+U 	 r+   N)__name__
__module____qualname____doc__r*   r   r   r   r   r^   r   r   r+   r)   r   r      s9    @:9x	b. 5  5r+   r   )rr   rI   r   image.ImagesExtractorr   shape.Pathsr   common.constantsr   common.Elementr	   common.sharer
   r   common.algorithmr   r   r   r+   r)   <module>ry      s2      3  , $ 1 'I' Ir+   