
    khV7                     X    d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
  G d d      Zy)	a  Extract images from PDF.

Both raster images and vector graphics are considered:

* Normal images like jpeg or png could be extracted with method ``page.get_text('rawdict')`` 
  and ``Page.get_images()``. Note the process for png images with alpha channel.
* Vector graphics are actually composed of a group of paths, represented by operators like
  ``re``, ``m``, ``l`` and ``c``. They're detected by finding the contours with ``opencv``.
    N   )
Collection)	BlockType)recursive_xy_cutinner_contoursxy_project_profilec                      e Zd Zdej                  ddfdZddej                  defdZddej                  defd	Z	ddefd
Z
dedededefdZedej                  dej                  fd       Zedej                  defd       Zedej                  fd       Zedej$                  defd       Zedej                  fd       Zy)ImagesExtractorpagereturnNc                     || _         y)zwExtract images from PDF page.
        
        Args:
            page (fitz.Page): pdf page to extract images.
        N)_page)selfr   s     Z/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/image/ImagesExtractor.py__init__zImagesExtractor.__init__   s     
    bboxzoomc                 f   | j                  | j                         || j                  j                  }n2| j                  j                  r|| j                  j                  z  }n|}|| j                  j                  z  }t        j                  ||      }| j                  j                  ||      S )a  Clip page pixmap (without text) according to ``bbox``.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
                Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on
                the final page.
            zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.

        Returns:
            fitz.Pixmap: The extracted pixmap.
        )clipmatrix)_hide_page_textr   rectrotationrotation_matrixfitzMatrix
get_pixmap)r   r   r   	clip_bboxr   s        r   clip_page_to_pixmapz#ImagesExtractor.clip_page_to_pixmap   s     	TZZ(<

I ZZ  tzz999I I

/	
 T4(zz$$)F$CCr   clip_image_res_ratioc                 L    | j                  ||      }| j                  ||      S )ar  Clip page pixmap (without text) according to ``bbox`` and convert to source image.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.

        Returns:
            list: A list of image raw dict.
        )r   r   )r    _to_raw_dict)r   r   r!   pixs       r   clip_page_to_dictz!ImagesExtractor.clip_page_to_dict?   s-     &&D7K&L  d++r   c                 X   | j                   j                  }| j                   j                  }t               }| j                   j	                  d      D ]  }t        |      }d|d<   | j                   j                  |      }| j                   j                  }|D ];  }|j                         dk  r|j                  |      s)|j                  ||f       =  d }	|j                  |	      }
g }|
D ]  }t        |      dkD  r6t        j                         }|D ]
  \  }}||z  } | j                  ||      }n_|d   \  }}|d   d	k(  r| j                  ||      }n<| j!                  ||      }| j#                  ||      }|r| j%                  ||       |d
<   |j                  |        |S )a  Extract normal images with ``Page.get_images()``.

        Args:
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.

        Returns:
            list: A list of extracted and recovered image raw dict.
        
        .. note::
            ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page.
        T)fullr      c                 0    | d   j                  |d         S )Nr   )
intersects)abs     r   <lambda>z0ImagesExtractor.extract_images.<locals>.<lambda>v   s    1Q4??1Q40 r          image)r   parentr   r   
get_imageslistget_image_rectscropboxget_arear+   appendgrouplenr   Rectr%   _recover_pixmapr#   _rotate_image)r   r!   docr   icitemrectsunrotated_page_bboxr   fungroupsimagesr:   r   raw_dictr$   s                   r   extract_imageszImagesExtractor.extract_imagesM   s    jj::&& \JJ))t)4 	(D:DDH JJ..t4E"&**"4"4 (==?A%x +55d;X 		4,'(	($ 1#  (	$E5zA~ IIK	$)<LT49+<9<11)=QR #1X
d& 7B;#55d<PQH
 ..sD9C  $00d;H,0,>,>sXI,N)MM(#Q(	$T r   min_svg_gap_dxmin_svg_gap_dymin_wmin_hc           	         ddl }| j                  d      }| j                  |      }|j                  ||j                        }|j                  |dd|j                        \  }	}
t        |
||      }|D cg c]  }t        |
|||       }}t        t        ||            }d}|rt        |      D ]@  \  }\  }}}}t        |||||ddf   |
||||f         }|j                  d	| |       B |D ]J  \  }}|\  }}}}|j                  |||f||fd
d       |D ]   \  }}}}|j                  |||f||fdd       " L |j                  d|       |j                  d       |S c c}w )a  Find contour of potential vector graphics.

        Args:
            min_svg_gap_dx (float): Merge svg if the horizontal gap is less than this value.
            min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value.
            min_w (float): Ignore contours if the bbox width is less than this value.
            min_h (float): Ignore contours if the bbox height is less than this value.

        Returns:
            list: A list of potential svg region: (external_bbox, inner_bboxes:list).
        r   N      ?)r         )min_dxmin_dyFz
sub-image-)rP   r   r   r/   )r   r   rP   img)cv2r    _pixmap_to_cv_imagecvtColorCOLOR_BGR2GRAY	thresholdTHRESH_BINARY_INVr   r   r5   zip	enumerater   imshow	rectanglewaitKey)r   rI   rJ   rK   rL   cvpixmapsrcgray_binaryexternal_bboxesr   grouped_inner_bboxesrE   debugix0y0x1y1arrinner_bboxesu0v0u1v1s                              r   detect_svg_contoursz#ImagesExtractor.detect_svg_contours   s    	 ))s)3&&v. {{3 1 12LLsC1E1EF	6 +6.Q_` XggtvtUE Jgg c/+?@A '0'A 1##BB(RUBrE1_)=vbeRPRUl?ST		Jqc*C01 '- H"l!%BBS2r(RHiC '3 HNBBLLr2hR)QGHH IIeS!JJqM5  hs   7E)r2   c                     t         j                  j                  t        |      | j                  | j
                  | j                         dS )zStore Pixmap ``image`` to raw dict.

        Args:
            image (fitz.Pixmap): Pixmap to store.
            bbox (fitz.Rect): Boundary box the pixmap.

        Returns:
            dict: Raw dict of the pixmap.
        )typer   widthheightr2   )r   IMAGEvaluetuplerv   rw   tobytes)r2   r   s     r   r#   zImagesExtractor._to_raw_dict   s9     OO))$K[[ll]]_
 	
r   r`   r   c                    ddl }ddl}t        j                  |       }|j                  dd \  }}|dz  |dz  }}d}	|j                  ||f||	      }
|j                  |
d         }|j                  |
d         }t        ||z  ||z  z         }t        ||z  ||z  z         }|
dxx   |dz  |z
  z  cc<   |
dxx   |dz  |z
  z  cc<   |j                  ||
||f      }|j                  d	|      \  }}|j                         S )
zRotate image represented by image bytes.

        Args:
            pixmap (fitz.Pixmap): Image to rotate.
            rotation (int): Rotation angle.
        
        Return: image bytes.
        r   Nr   rN   )r   r   )r   r/   )r   r   )r/   r   z.png)rT   numpyr
   rU   shapegetRotationMatrix2Dabsint
warpAffineimencoder{   )r`   r   r_   nprS   hwri   rj   scaler   cossinWHrotated_imgrc   im_pngs                     r   r>   zImagesExtractor._rotate_image   s"    	 11&9yy!}1 Aq!tB  ''R(EB ffVD\"ffVD\" SQW%&SQW%& 	tQ"$tQ"$ mmC!Q8 KK4	6~~r   c           
      n   | j                         D cg c]	  \  }}}}| }}}}}|j                  | j                                | j                  }|D ]U  }|j	                  |      j                  dd      j                  dd      j                  dd      }|j                  ||       W yc c}}}}w )z$Hide page text before clipping page.s   BTs   BT 3 Trs   Tms   Tm 3 Trs   Tds   Td 3 TrN)get_xobjectsextendget_contentsr3   xref_streamreplaceupdate_stream)r   xrefnameinvokerr   	xref_listr?   streams           r   r   zImagesExtractor._hide_page_text!  s     >B=N=N=PQQ9tT7DTQ	Q**,- kk 	,D__T*225*E.5geZ.H.5geZ.H  dF+		, Rs   B/
r?   rA   c                    |d   }|d   }t        j                  | |      }|dkD  rt        j                  | |      }|j                  rt        j                  |d      }d}|}|j                  |j                  k(  r0|j                  |j                  k(  rt        j                  ||      }nt        j                  d|       d|d   j                         v r$t        j                  t         j                  |      }|S )a&  Restore pixmap with soft mask considered.
        
        References:

            * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList        
            * https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-handle-stencil-masks
            * https://github.com/pymupdf/PyMuPDF/issues/670

        Args:
            doc (fitz.Document): pdf document.
            item (list): image instance of ``page.get_images()``.

        Returns:
            fitz.Pixmap: Recovered pixmap with soft mask considered.
        r   r/   NzCIgnore image due to inconsistent size of color and mask pixmaps: %sCMYKr0   )	r   Pixmapalpharv   rw   loggingwarninguppercsRGB)r?   rA   xsr$   masktemps          r   r=   zImagesExtractor._recover_pixmap6  s    & GG kk#q! q5;;sA&Dyy{{3* yy$**$T[[)@kk#t, egkl
 T!W]]_$++djj#.C
r   c                     ddl }ddl}| j                         }|j                  |j	                  ||j
                        |j                        S )znConvert fitz Pixmap to opencv image.

        Args:
            pixmap (fitz.Pixmap): PyMuPDF Pixmap.
        r   N)rT   r}   r{   imdecode
frombufferuint8IMREAD_COLOR)r`   r_   r   img_bytes       r   rU   z#ImagesExtractor._pixmap_to_cv_imagef  s:     	>>#{{2==288<booNNr   )N      @)r   )__name__
__module____qualname__r   Pager   r<   floatr    r%   rH   rs   staticmethodr   r#   r   r>   r   Documentr5   r=   rU    r   r   r
   r
      s7   DII $  Dtyy  DE  DF,TYY ,% ,X% Xv4 4u 4TY 4af 4n 
4;; 
TYY 
 
& * T[[ * 3 *  * Z ,TYY , ,( ,DMM , , ,^ 	O4;; 	O 	Or   r
   )__doc__r   r   common.Collectionr   common.sharer   common.algorithmr   r   r   r
   r   r   r   <module>r      s,      * $ U U^O ^Or   