
    kh3O                        d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZ dd	lmZ  e eeej.                  j1                  d
                  Zeg dk  sg decxk  rg dk  rn n ed       ej6                  ej8                  d        G d d      Z G d de      Z G d de      Z y)zPDF to Docx Converter.    N)Pool	cpu_count)perf_counter)AnyStrIOUnion)Document   )Page)Pages.)r
      r   )r
         )r
   r      zJ1.19.0 <= PyMuPDF <= 1.23.8, or PyMuPDF>=1.23.16 is required for pdf2docx.z[%(levelname)s] %(message)s)levelformatc            	       f   e Zd ZdZ	 d$dededefdZed        Zed        Z	d	 Z
ed
        Zd%dededefdZd%dededefdZd Zd Zd&dZd ZdefdZdefdZdefdZd$dedededefdZd'deeee   f   dededefdZd%dededefdZdededefd Zed!        Z ed"        Z!ed#        Z"y)(	Convertera  The ``PDF`` to ``docx`` converter.
    
    * Read PDF file with ``PyMuPDF`` to get raw layout data page by page, including text,
      image, drawing and its properties, e.g. boundary box, font, size, image width, height.
    * Analyze layout in document level, e.g. page header, footer and margin.
    * Parse page layout to docx structure, e.g. paragraph and its properties like indentation, 
      spacing, text alignment; table and its properties like border, shading, merging. 
    * Finally, generate docx with ``python-docx``.
    Npdf_filepasswordstreamc                     || _         t        |xs d      | _        |s|st        d      |rt	        j
                  |      | _        nt	        j
                  |      | _        t               | _        y)zInitialize fitz object with given pdf file path.

        Args:
            pdf_file (str): pdf file path.
            stream   (bytes): pdf file in memory.
            password (str): Password for encrypted pdf. Default to None if not encrypted.
         z(Either pdf_file or stream must be given.)r   N)	filename_pdfstrr   
ValueErrorfitzr	   	_fitz_docr   _pages)selfr   r   r   s       N/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/converter.py__init__zConverter.__init__&   s]     %HN+GHH!]]&9DN "]]84DN g    c                     | j                   S N)r   r!   s    r"   fitz_doczConverter.fitz_docA   s    #~~-r$   c                     | j                   S r&   )r    r'   s    r"   pageszConverter.pagesD   s     KK'r$   c                 8    | j                   j                          y r&   )r   closer'   s    r"   r,   zConverter.closeH   s    T^^))+r$   c                     i ddddddddddd	d
ddddddddddddddddddddddddddddd dddddddd!S )"zDefault parsing parameters.debugFocrr   ignore_page_errorTmulti_processingr   min_section_heightg      4@connected_border_toleranceg      ?max_border_widthg      @min_border_clearanceg       @float_image_ignorable_gapg      @page_margin_factor_toppage_margin_factor_bottomshape_min_dimensionmax_line_spacing_ratiog      ?line_overlap_thresholdg?line_break_width_ratioline_break_free_space_ratiog?g333333?g      ?g      @g      .@)line_separate_thresholdnew_paragraph_free_space_ratiolines_left_aligned_thresholdlines_right_aligned_thresholdlines_center_aligned_thresholdclip_image_res_ratiomin_svg_gap_dxmin_svg_gap_dy	min_svg_w	min_svg_hextract_stream_tableparse_lattice_tableparse_stream_tabledelete_end_line_hyphen r'   s    r"   default_settingszConverter.default_settingsK   s    
u 
q 
  t 
 u	 

 q 
 !t 
 )s 
 s 
 #s 
 (s 
 %s 
 (s 
 "s 
 %s 
 %s 
  %s! 
" *s# 
$ 03/3/2/2/2/2/3/2/2/2/4/3/3/4? 
  	
r$   startendr*   c                 h      | j                  |||      j                  di |j                  di |S )a2  Parse pages in three steps:
        * open PDF file with ``PyMuPDF``
        * analyze whole document, e.g. page section, header/footer and margin
        * parse specified pages, e.g. paragraph, image and table

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
            kwargs (dict, optional): Configuration parameters. 
        rL   )
load_pagesparse_documentparse_pages)r!   rN   rO   r*   kwargss        r"   parsezConverter.parset   sD     tuc51^&$&[#!# 	#r$   c           	      0   t        j                  | j                  d             | j                  j                  rU| j
                  st        d| j                   d      | j                  j                  | j
                        st        d      t        | j                        }| j                  j                  t        |      D cg c]  }t        |d       c}       | j                  ||||      }|D ]  }d| j                  |   _         | S c c}w )a  Step 1 of converting process: open PDF file with ``PyMuPDF``, 
        especially for password encrypted file.
        
        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
        z[1/4] Opening document...zRequire password for r   zIncorrect password.Tidskip_parsingF)logginginfo_color_outputr   
needs_passr   ConversionExceptionr   authenticatelenr    resetranger   _page_indexesrY   )r!   rN   rO   r*   numipage_indexess          r"   rQ   zConverter.load_pages   s     	T''(CDE >>$$==),A$BSBSATTU*VWW^^00?)*?@@ $..!%*MQ4148MN ))%eSA 	0A*/DKKN'	0  Ns   Dc                     t        j                  | j                  d              | j                  j                  | j
                  fi | | S )zjStep 2 of converting process: analyze whole document, e.g. page section,
        header/footer and margin.z[2/4] Analyzing document...)rZ   r[   r\   r    rU   r(   )r!   rT   s     r"   rR   zConverter.parse_document   s>     	T''(EFG$--262r$   c                    t        j                  | j                  d             | j                  D cg c]  }|j                  r| }}t        |      }t        |d      D ]?  \  }}|j                  dz   }t        j                  d|||       	  |j                  di | A | S c c}w # t        $ r=}|d   s|d   rt        j                  d||       nt        d| d	|       Y d
}~d
}~ww xY w)zKStep 3 of converting process: parse pages, e.g. paragraph, image and table.z[3/4] Parsing pages...r
   rN   (%d/%d) Page %dr.   r0   z,Ignore page %d due to parsing page error: %szError when parsing page : NrL   )rZ   r[   r\   r    rY   r`   	enumeraterX   rU   	Exceptionerrorr^   )r!   rT   pager*   	num_pagesre   pides           r"   rS   zConverter.parse_pages   s    T''(@AB"&++G$T5F5FGGJ	 a0 		UGAt''A+CLL*Ay#>U

$V$			U  H  Ug62E+FMM"PRUWXY-0HRPQs.STT ZUs#   B'B'B,,	C253C--C2c                    t        j                  | j                  d             t        t	        d | j
                              }|st        d      |sh| j                  rQ| j                  dt        d         d}t        j                  j                  |      r!t        j                  |       nt        d      t               }t        |      }t        |d	      D ]K  \  }}|j                  s|j                   dz   }t        j                  d
|||       	 |j#                  |       M |j+                  |       y# t$        $ r=}	|d   s|d   rt        j&                  d||	       nt)        d| d|	       Y d}	~	d}	~	ww xY w)zStep 4 of converting process: create docx file with converted pages.
        
        Args:
            filename_or_stream (str, file-like): docx file to write.
            kwargs (dict, optional): Configuration parameters.
        z[4/4] Creating pages...c                     | j                   S r&   )	finalized)ro   s    r"   <lambda>z%Converter.make_docx.<locals>.<lambda>   s
     r$   z)No parsed pages. Please parse page first.r   z.pdfz.docxz?Please specify a docx file name or a file-like object to write.r
   ri   rj   r.   r0   z+Ignore page %d due to making page error: %szError when make page rk   N)rZ   r[   r\   listfilterr    r^   r   r`   ospathexistsremover	   rl   ru   rX   	make_docxrm   rn   MakedocxExceptionsave)
r!   filename_or_streamrT   parsed_pages	docx_filerp   re   ro   rq   rr   s
             r"   r}   zConverter.make_docx   sm    	T''(ABC F'
  %&QRR!  (,(9(9!S[L(I'J%%P"77>>"45ryyAS7T)*kll J	%	 Q7 
	PGAt>>8''A+CLL*Ay#>Py)
	P 	)*  Pg62E+FMM"OQTVWX+.CC51#,NOO YPs   D>>	F3E??Fc                     t         j                  j                  | j                        t	        | j
                        | j
                  D cg c]  }|j                  s|j                         ! c}dS c c}w )z"Store parsed pages in dict format.)filenamepage_cntr*   )ry   rz   basenamer   r`   r    ru   store)r!   ro   s     r"   r   zConverter.store   sV     (():):;DKK(26++P$P
 	
 Qs   A4A4datac           	      R   | j                   sO|j                  dd      }| j                   j                  t        |      D cg c]  }t	        |d       c}       |j                  dg       D ]2  }|j                  dd      }| j                   |   j                  |       4 yc c}w )	z"Restore pages from parsed results.r   d   TrW   r*   rX   N)r    getra   rb   r   restore)r!   r   rd   re   raw_pageidxs         r"   r   zConverter.restore   s     {{((:s+CKKeCjQtqt<QR "- 	/H,,tR(CKK$$X.	/ Rs   B$r   c                     t        |dd      5 }|j                  t        j                  | j	                         d             ddd       y# 1 sw Y   yxY w)z*Write parsed pages to specified JSON file.wzutf-8)encoding   )indentN)openwritejsondumpsr   )r!   r   fs      r"   	serializezConverter.serialize  sB    (C'2 	8aGGDJJtzz|A67	8 	8 	8s   5AAc                     t        |d      5 }t        j                  |      }ddd       | j                         y# 1 sw Y   xY w)z+Load parsed pages from specified JSON file.rN)r   r   loadr   )r!   r   r   r   s       r"   deserializezConverter.deserialize  s<    (C  	 A99Q<D	 T	  	 s	   =Are   docx_filename	debug_pdflayout_filec                    t         j                  j                  | j                        \  }}|s#t         j                  j	                  |d|       }|s t         j                  j	                  |d      }|j                  dt        j                         |d        | j                  |fd|gi| | j                  |       y)a  Parse, create and plot single page for debug purpose.
        
        Args:
            i (int): Page index to convert.
            docx_filename (str): docx filename to write to.
            debug_pdf (str): New pdf file storing layout information. Default to add prefix ``debug_``.
            layout_file (str): New json file storing parsed layout data. Default to ``layout.json``.
        debug_zlayout.jsonT)r.   	debug_docdebug_filenamer*   N)
ry   rz   splitr   joinupdater   r	   convertr   )r!   re   r   r   r   rT   rz   r   s           r"   
debug_pagezConverter.debug_page  s     t'8'89hbggll46(9L&M)277<<m+LK""mmo'
 	 	]81#88 	{#r$   c                 z   t               }t        j                  d| j                         | j                  }|j                  |       |r|d   rt        d      |d   r | j                  |||fi | n&  | j                  |||fi |j                  |fi | t        j                  dt               |z
         y)a  Convert specified PDF pages to docx file.

        Args:
            docx_filename (str, file-like, optional): docx file to write. Defaults to None.
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Refer to :py:meth:`~pdf2docx.converter.Converter.default_settings` for detail of 
        configuration parameters.
        
        .. note::
            Change extension from ``pdf`` to ``docx`` if ``docx_file`` is None.
        
        .. note::
            * ``start`` and ``end`` is counted from zero if ``--zero_based_index=True`` (by default).
            * Start from the first page if ``start`` is omitted.
            * End with the last page if ``end`` is omitted.
        
        .. note::
            ``pages`` has a higher priority than ``start`` and ``end``. ``start`` and ``end`` works only
            if ``pages`` is omitted.

        .. note::
            Multi-processing works only for continuous pages specified by ``start`` and ``end`` only.
        zStart to convert %sr1   zPMulti-processing works for continuous pages specified by "start" and "end" only.zTerminated in %.2fs.N)
r   rZ   r[   r   rM   r   r^   _convert_with_multi_processingrU   r}   )r!   r   rN   rO   r*   rT   t0settingss           r"   r   zConverter.convert2  s    : ^*D,=,=>(( X01% 'K L L &'/D//ucVXV?JDJJuc55H5??ZQYZ+\^B->?r$   c                     | j                   }|j                  |        | j                  |||fi | g }| j                  D ]0  }|j                  s|j                   |j                  di |       2 |S )a  Extract table contents from specified PDF pages.

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Returns:
            list: A list of parsed table content.
        rL   )rM   r   rU   r    ru   extendextract_tables)r!   rN   rO   r*   rT   r   tablesro   s           r"   r   zConverter.extract_tablesb  sx     ((

5#u11 KK 	ND~~v}}-@T-@-@-L8-LM	N r$   c                    |d   rt        |d   t                     n	t               }d}t        |      D cg c]&  }||||| j                  | j                  || d| df( }}t               }	|	j                  | j                  |d       t        |      D ]P  }| d| d}
t        j                  j                  |
      s+| j                  |
       t        j                  |
       R  | j                  |fi | yc c}w )zParse and create pages based on page indexes with multi-processing.

        Reference:

            https://pymupdf.readthedocs.io/en/latest/faq.html#multiprocessing
        r   r*   -z.jsonr
   N)minr   rb   r   r   r   map_parse_pages_per_cpury   rz   r{   r   r|   r}   )r!   r   rN   rO   rT   cpuprefixre   vectorspoolr   s              r"   r   z(Converter._convert_with_multi_processing{  s    8>k7Jc&%y{3PYP[CH:O>? sE3(9(94=="vhas%$8: O O v**GQ7 s 	 A 1#U+H77>>(+XX&IIh		  	}//Os   +C<c                 r   | \  }}}}}}}}t        ||      }	|	j                          |xs t        |	j                        }t	        ||      }
t        |
      }t        ||z        }||z  }|t        ||k        z   }|dz   |z  t        ||z
  d      z   }t        ||z   |      }t	        ||      D cg c]  }|
|   	 }}|	j                  D ]	  }d|_         |D ]  }d|	j                  |   _           |	j                  di |j                  di |j                  |       |	j                          yc c}w )a  Render a page range of a document.
        
        Args:
            vector (list): A list containing required parameters.
                * 0  : segment number for current process                
                * 1  : count of CPUs
                * 2,3: whole pages range to process
                * 4  : pdf filename
                * 5  : password for encrypted pdf
                * 6  : configuration parameters
                * 7  : json filename storing parsed results
        r
   r   TFNrL   )r   rQ   r`   r(   rb   intr   r*   rY   rR   rS   r   r,   )vectorr   r   srr   pdf_filenamer   rT   json_filenamecvall_indexesrp   mnseg_sizeseg_fromseg_tore   rf   ro   s                       r"   r   zConverter._parse_pages_per_cpu  sL    IOES!Qh |X.
 !R[[!Aqk$	 	#Os3q5z>aC9s1S5!},X()405h0GH1AHH HH6D$d/6 	-A',BHHQK$	-	#F#[	#!	#Y}%

 Is   1D4c                     |r|D cg c]  }t        |       }}|S |xs |}t        t        |       t        |            }t        |      |   }|S c c}w )zParsing arguments.)r   slicerb   )rN   rO   r*   pdf_lenxindexesr   s          r"   rc   zConverter._page_indexes  s\     ',-!s1v-G- 	 .Cc%j#c(+AGnQ'G .s   Ac                     d|  dS )Nz[1;36mz[0mrL   )msgs    r"   r\   zConverter._color_output  s    %/uG#<<r$   )NNN)r   NNr&   )Nr   NN)#__name__
__module____qualname____doc__r   bytesr#   propertyr(   r*   r,   rM   r   rw   rU   rQ   rR   rS   r}   r   dictr   r   r   r   r   r   r   r   r   r   staticmethodr   rc   r\   rL   r$   r"   r   r      s    KO.1BG6 - -' ' , "
 "
P## #S #T #"s # $ >((+^

/4 
/8 83 $3 $c $# $X[ $6-@U36
?%; -@3 -@Y\ -@lp -@`3 c d 203 0c 0s 08 + +\ 	 	 < <r$   r   c                       e Zd Zy)r^   Nr   r   r   rL   r$   r"   r^   r^         r$   r^   c                       e Zd Zy)r~   Nr   rL   r$   r"   r~   r~     r   r$   r~   )!r   r   rZ   ry   multiprocessingr   r   timer   typingr   r   r   r   docxr	   	page.Pager   
page.Pagesr   rw   r   r   VersionBindr   v
SystemExitbasicConfigINFOr   rm   r^   r~   rL   r$   r"   <module>r      s       	 +  $ $     	S$""((-	./x<8A'i'
a
bb   
,,(*
y= y=x	) 		+ 	r$   