
    khe                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d de      Zy)a  
A group of text elements, distinguished to ``Shape`` elements. For instance, ``TextBlock``, 
``ImageBlock`` or ``TableBlock`` after parsing, while ``Line`` instances at the beginning, 
and a combination of ``Line`` and ``TableBlock`` during parsing process.
    N)Pt   )	constants)ElementCollection)	BlockTypelower_round	rgb_value)Block)reset_paragraph_formatdelete_paragraph)	TextBlock)TextSpan)Line)Lines)Cell)
ImageBlock)
TableBlockc                   r    e Zd ZdZd)def fdZdefdZed        Z	ed        Z
ed        Zed	        Zed
        Zed        ZdefdZdedefdZdefdZdedefdZdededefdZdefdZd Zd Zd ZdefdZdefd Zeded!ed"efd#       Zdefd$Zedededefd%       Zd& Z d' Z!d( Z" xZ#S )*BlockszBlock collections.	instancesc                 4    t         |   ||       g | _        y)zC A collection of text based elements, e.g. lines, images or blocks.N)super__init___floating_image_blocks)selfr   parent	__class__s      R/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/layout/Blocks.pyr   zBlocks.__init__   s    F+&(#    blockc                      y)zOverride. The parent is ``Layout``, which is not necessary to update its bbox. 
        So, do nothing but required here.
        N )r   r    s     r   _update_bboxzBlocks._update_bbox   s     	r   c                     | j                   S N)r   r   s    r   floating_image_blockszBlocks.floating_image_blocks&   s    ***r   c                 B    t        t        d | j                              S )z6Get lattice table blocks contained in this Collection.c                 >    t        | t              xr | j                  S r%   )
isinstancer
   is_lattice_table_blockr    s    r   <lambda>z-Blocks.lattice_table_blocks.<locals>.<lambda>/   s    *UE2Su7S7S r   listfilter
_instancesr&   s    r   lattice_table_blockszBlocks.lattice_table_blocks+   s'     FSUYUdUdf g 	gr   c                 B    t        t        d | j                              S )z5Get stream table blocks contained in this Collection.c                 >    t        | t              xr | j                  S r%   )r*   r
   is_stream_table_blockr,   s    r   r-   z,Blocks.stream_table_blocks.<locals>.<lambda>6       *UE2Ru7R7R r   r.   r&   s    r   stream_table_blockszBlocks.stream_table_blocks2   '     FRTXTcTce f 	fr   c                 B    t        t        d | j                              S )z.Get table blocks contained in this Collection.c                 >    t        | t              xr | j                  S r%   )r*   r
   is_table_blockr,   s    r   r-   z%Blocks.table_blocks.<locals>.<lambda>=   s    *UE2Ku7K7K r   r.   r&   s    r   table_blockszBlocks.table_blocks9   s$     FKT__^ _ 	_r   c                 B    t        t        d | j                              S )z5Get inline image blocks contained in this Collection.c                 >    t        | t              xr | j                  S r%   )r*   r
   is_inline_image_blockr,   s    r   r-   z,Blocks.inline_image_blocks.<locals>.<lambda>D   r6   r   r.   r&   s    r   inline_image_blockszBlocks.inline_image_blocks@   r8   r   c                 B    t        t        d | j                              S )z3Get text/image blocks contained in this Collection.c                 >    t        | t              xr | j                  S r%   )r*   r
   is_text_image_blockr,   s    r   r-   z$Blocks.text_blocks.<locals>.<lambda>K   s    *UE2Pu7P7P r   r.   r&   s    r   text_blockszBlocks.text_blocksG   s'     FPRVRaRac d 	dr   rawsc                    | j                          |D ]  }|j                  dd      }|t        j                  j                  k(  rt        |      j                         }nm|t        j                  j                  k(  rt        |      }nD|t        j                  j                  t        j                  j                  fv rt        |      }nd}| j                  |        | S )a  Clean current instances and restore them from source dict.
        ImageBlock is converted to ImageSpan contained in TextBlock.

        Args:
            raws (list): A list of raw dicts representing text/image/table blocks.

        Returns:
            Blocks: self
        typeN)resetgetr   IMAGEvaluer   to_text_blockTEXTr   LATTICE_TABLESTREAM_TABLEr   append)r   rE   	raw_block
block_typer    s        r   restorezBlocks.restoreN   s     	

 	I"vr2J Y__222"9-;;= y~~333!), 	 7 7 = =y?U?U?[?[\\"9-  KK'	* r   float_image_ignorable_gapline_overlap_thresholdc                    | j                   syg }| j                   D ]4  }t        |t        t        f      s|j	                  |j
                         6 | j                  j                  fd}t        t        ||            }|D ]  }|j                           | j                  |      j                  |      j                  |       y)a  Clean up blocks in page level.

        * convert to lines
        * remove lines out of page
        * remove transformed text: text direction is not (1, 0) or (0, -1)
        * remove empty lines

        Args:
            float_image_ignorable_gap (float): Regarded as float image if the intersection exceeds this value.
            line_overlap_threshold (float): remove line if the intersection exceeds this value.

        .. note::
            The block structure extracted from ``PyMuPDF`` might be unreasonable, e.g. 
            * one real paragraph is split into multiple blocks; or
            * one block consists of multiple real paragraphs
        Nc                     | j                   j                        xr) | j                   xr | j                  xs | j                  S r%   )bbox
intersectswhite_space_onlyis_horizontal_textis_vertical_text)line	page_bboxs    r   r-   z!Blocks.clean_up.<locals>.<lambda>   sE    --i8 N 111N 33Lt7L7L r   )r1   r*   r   r   extendlinesr   working_bboxr/   r0   striprI   _identify_floating_images_remove_overlapped_lines)r   rU   rV   r   r    fr^   r_   s          @r   clean_upzBlocks.clean_upq   s    "  	__ 	*Eej)%<=xU[[)	*
 KK,,	N 9-.	 +Dtzz|+ 	

9&&'@A%%&<=r   tablesc                    |sy|D cg c]  }g  }}g }| j                   D ]  }| j                  ||||        t        ||      D ]  \  }}|s	|j                  |        |j	                  |       | j                  |       yc c}w )zAdd blocks (line or sub-table) to associated cells of given tables.

        Args:
            tables (list): A list of TableBlock instances.
        N)r1   _assign_block_to_tableszipassign_blocksr`   rI   )r   rh   _blocks_in_tablesblocksr    tableblocks_in_tables           r   assign_to_tableszBlocks.assign_to_tables   s     v )//1B//__ 	RE((8H&Q	R '*&2B&C 	1"E?"H0	1 	f

6 0s   	Bpotential_shadingsline_separate_thresholdc           	      R   | j                  | j                         }d }t        t        d |            fd}g g fd}|d   j                  j
                  }t        | j                  t              }|D ]  }	|	j                  }
|	j                  ||      s&j                  |	D cg c]
  } ||       c}       n |        |	D ]"  } ||      sj                   ||             $ |
j                  |z
  dk\  r |        |
j
                  }  |        S c c}w )	a  Collect elements in Line level (line or table bbox), which may contained in a stream table region.
        
        Table may exist on the following conditions:

        * blocks in a row don't follow flow layout; or
        * block is contained in potential shading
        
        Args:
            potential_shadings (list): a group of shapes representing potential cell shading
            line_separate_threshold (float): two separate lines if the x-distance exceeds this value
        
        Returns:
            list: A list of Lines. Each group of Lines represents a potential table.
        
        .. note::
            ``PyMuPDF`` may group multi-lines in a row as a text block while each line belongs to different
            cell. So, it's required to deep into line level.
        )text_directionc                 l    t        | t              r| S t               j                  | j                        S r%   )r*   r   update_bbox
outer_bboxr,   s    r   sub_linez-Blocks.collect_stream_lines.<locals>.sub_line   s)    &ud35]9K9KEL\L\9]]r   c                 T    | j                    xr | j                  t        g d      k7  S )N)   r|   r|   )is_determinedcolorr	   )shapes    r   r-   z-Blocks.collect_stream_lines.<locals>.<lambda>   s"    e111Uekk9WCU6U r   c                 Z    D ]%  }|j                  | t        j                        s% y y)N	thresholdTF)containsr   FACTOR_MOST)r    shadingshadings_exclude_whites     r   contained_in_shadingsz:Blocks.collect_stream_lines.<locals>.contained_in_shadings   s3    1 Y##EY5J5J#KTXYr   c                  `    sy  j                  t                     j                          y r%   )rQ   r   clear)restable_liness   r   close_tablez0Blocks.collect_stream_lines.<locals>.close_table   s%    JJu[)*r   r   )cell_layout2   )group_by_rowsis_mix_textr/   r0   rY   y1r*   r   r   is_flow_layoutr`   rQ   y0)r   rs   rt   rowsrz   r   r   ref_posr   rowrY   r    r   r   r   s               @@@r   collect_stream_lineszBlocks.collect_stream_lines   s+   , !!T5E5E1E!F	^ "&fU'  "!	 	  q',,// d3 	C88D %%&=;%W""#EHUO#EF   U(/1C1CHUO1TU www"KM ggG'	, 	
% $Fs   /D$
max_line_spacing_ratioline_break_free_space_rationew_paragraph_free_space_ratioc                     | j                          | j                  |      }| j                  |||      }| j                  |       y)zGroup lines into text block.N)sort_in_reading_order_plus_join_lines_vertically_split_text_block_verticallyrI   )r   r   r   r   ro   s        r   parse_blockzBlocks.parse_block   sK     	'') ,,-CD 226'*, 	

6r   delete_end_line_hyphenc                     t        d | j                        D ].  }|j                  |       |j                  j	                  |       0 y)zParse text format with style represented by stroke/fill shapes.
        
        Args:
            rects (Shapes): Potential styles applied on blocks.
            delete_end_line_hyphen (bool): delete hyphen at the end of a line if True.
        c                     | j                   S r%   )is_text_block)es    r   r-   z*Blocks.parse_text_format.<locals>.<lambda>  s
    aoo r   N)r0   r1   parse_text_formatra   adjust_last_word)r   rectsr   r    s       r   r   zBlocks.parse_text_format  sD     5tG 	AE##E* KK(()?@		Ar   c                 |    | j                   sy | j                  |  | j                          | j                          y)a,  Calculate external and internal space for text blocks:

        - vertical distance between blocks, i.e. paragraph before/after spacing
        - horizontal distance to left/right border, i.e. paragraph left/right indent
        - vertical distance between lines, i.e. paragraph line spacing
        N)r1   _parse_block_horizontal_spacing_parse_block_vertical_spacing_parse_line_spacing)r   argss     r   parse_spacingzBlocks.parse_spacing  s5     ,,,d3**,  "r   c                    fd}d}t        | j                  t              }| j                  D ]e  }|j                  r$j                         }|j                  |       d}3|j                  s@ |||       d}|sNt        j                  d          g | j                  ddd   D ]P  }|j                  r|j                  s yj                         }t        |t        t        j                               R y)zCreate page based on parsed block structure. 
        
        Args:
            doc (Document, _Cell): The container to make docx content.
        c                 H   | j                   t        j                  k\  s|r<t        | j                   d      }j	                         }t        |t        |             j                  | j                  | j                        }d|_
        d|_        | j                  |       y )Nr|   )line_spacing)r   colsF)before_spacer   MIN_LINE_SPACINGr   add_paragraphr   r   	add_tablenum_rowsnum_colsautofitallow_autofit	make_docx)table_block	pre_tablehprp   docs        r   
make_tablez$Blocks.make_docx.<locals>.make_table1  s     '')C)CCy 8 8!<%%'&qr!u= MM{';';+BVBVMWE!EM#(E!!%(r   FTrH   N)r*   r   r   r1   rC   r   r   r;   r   
paragraphsis_float_image_blockr   r   r   r   )r   r   r   r   r   r    r   s    `     r   r   zBlocks.make_docx+  s    	) 	 d3__ 	9E((%%'"!	 %%5), 	
 $S^^B%78%	92 __TrT* 		FE))8 '' !!#A"1b)C)C&DE		Fr   c                 H    | j                   D ]  }|j                  |        y)z*Plot blocks in PDF page for debug purpose.N)r1   plot)r   pager    s      r   r   zBlocks.plotg  s    __6Eejj&66r   c                 <   | j                  | |       }t        d |      D ]v  }t        d |      D ]d  }t               j                  |j                  d         }|j                          | j                  j                  |       |j                  d       f x | S )z8Identify floating image lines and convert to ImageBlock.)dxdyc                     t        |       dkD  S Nr|   lengroups    r   r-   z2Blocks._identify_floating_images.<locals>.<lambda>u      #e*Q, r   c                     | j                   S r%   )image_spansr^   s    r   r-   z2Blocks._identify_floating_images.<locals>.<lambda>v  s    D,<,< r   r   r   r   r   r   )	group_by_connectivityr0   r   
from_imagespansset_float_image_blockr   rQ   rx   )r   rU   groupsr   r^   float_images         r   rd   z Blocks._identify_floating_imageso  s     ++0I/IOhNh+i 6? 	,E<eD ,(l55djjmD113++22;?   +,	, r   c                     fd}| j                  |      }t        d |      D ]K  }t        |d       }|dd D ]3  }t        j                  d|j
                         |j                  d       5 M | S )	zjDelete overlapped lines. 
        NOTE: Don't run this method until floating images are excluded.
        c                 *    | j                  |      S )Nr   )get_main_bbox)abrV   s     r   r-   z1Blocks._remove_overlapped_lines.<locals>.<lambda>  s    1??18N?O r   c                     t        |       dkD  S r   r   r   s    r   r-   z1Blocks._remove_overlapped_lines.<locals>.<lambda>  r   r   c                 6    | j                   j                         S r%   )rY   get_arear   s    r   r-   z1Blocks._remove_overlapped_lines.<locals>.<lambda>  s    $)):L:L:N r   keyNrH   zIgnore Line "%s" due to overlapr   )r   r0   sortedloggingwarningtextrx   )r   rV   funr   r   sorted_linesr^   s    `     r   re   zBlocks._remove_overlapped_lines  s|    
 PC 6? 	,E!%-NOL$Sb) , A499M  +,	, r   rn   ro   c                    t        ||      D ]_  \  }}|j                  | t        j                        r|j	                  |         y|j
                  j                  | j
                        r`a |j	                  |        y)zvAssign block (line or table block) to contained table region ``blocks_in_tables``,
        or out-of-table ``blocks``.r   N)rk   r   r   FACTOR_MAJORrQ   rY   rZ   )r    rh   rn   ro   rp   rq   s         r   rj   zBlocks._assign_block_to_tables  sp     '*&2B&C 	!"E?~~ey/E/E~F&&u- ZZ**5::6	! MM% r   c                    
  j                   rdnd\  fdfd}dt        fd} fd}g 
g 
fd} |       } j                  D ]  }t        |t              r |        
j                  |       ,rd	   nd
}|r|j                  |      rd}	n=|j                  s|j                  rd}	n" |||      |dz   k  r|| ||      z  k  rd}	nd}	|	r |        j                  |         |        
S )zCreate text blocks by merge lines with same properties (spacing, font, size) in 
        vertical direction. At this moment, the block instance is either Line or TableBlock.
        )r|      )r   r   c                 0    | j                   }|   |   fS )z/Coordinates of block top and bottom boundaries.)rY   )r    rY   idx0idx1s     r   	get_v_bdyz0Blocks._join_lines_vertically.<locals>.get_v_bdy  s    ::D:tDz))r   c                 N     |       \  }} |      \  }}t        ||z
  d      S )Nr   )round)block1block2u0u1v0v1r   s         r   vertical_distancez8Blocks._join_lines_vertically.<locals>.vertical_distance  s/    v&FBv&FBB?"r   r^   c                     t        | j                  d       }| j                  r|j                  j                  n|j                  j
                  }t        |d      S )z-The height of line span with most characters.c                 ,    t        | j                        S r%   )r   r   )ss    r   r-   zDBlocks._join_lines_vertically.<locals>.line_height.<locals>.<lambda>  s    QVV r   r   r   )maxr   r\   rY   heightwidthr   )r^   spanr   s      r   line_heightz2Blocks._join_lines_vertically.<locals>.line_height  s@    tzz'<=D$($;$;		  AA;r   c                       j                   d         \  } }g }j                   dd D ]/  } |      \  }}|j                  t        ||z
  d             ||}} 1 |rt        ||j                        S dS )zCVertical distance with most frequency: a reference of line spacing.r   r|   Nr   r           )r1   rQ   r   r   count)ref0ref1	distancesr    r   r   r   r   s         r   common_vertical_spacingz>Blocks._join_lines_vertically.<locals>.common_vertical_spacing  s    "4??1#56JD$I, $"5)B  r$w!23d$ ;D3yioo6LLr   c                      sy t               } | j                         j                  |        j                          y r%   )r   addrQ   r   )r    ro   ra   s    r   close_text_blockz7Blocks._join_lines_vertically.<locals>.close_text_block  s0    &KEIIeMM% KKMr   rH   NFTg      ?)r\   r   r1   r*   r   rQ   in_same_rowr   )r   r   r   r  r	  r  ref_disr    ref_linestart_new_blockro   r   r   r   ra   s   `         @@@@@r   r   zBlocks._join_lines_vertically  s     $66VF
d	*
	#
	T 		M 	 *+__ 	$E
 %, "e$ ).594  8#7#7#>&+O &&(*>*>&*O 'x7D3K4III&+O '+O"$4$6U#=	$B 	r   c                 2   g }| D ]  }|j                   s|j                  |       !|j                  j                  ||      }t	        |      dk(  r|j                  |       ]|D ].  }t               }|j                  |       |j                  |       0  |S )zSplit text block into separate paragraph based on punctuation of sentence.

        .. note::
            Considered only normal reading direction, from left to right, from top
            to bottom.
        r|   )r   rQ   ra   split_vertically_by_textr   r   r  )r   r   r   ro   r    
lines_listra   
text_blocks           r   r   z#Blocks._split_text_block_vertically  s      	.E &&e$ ==>Y@^`J:!e$' .E!*JNN5)MM*-.	.$ r   c                 x    | j                   j                  }| j                  D ]  } |j                  |g|   y)a  Calculate external horizontal space for text blocks, i.e. alignment mode and left spacing 
        for paragraph in docx:
        
            - horizontal block -> take left boundary as reference
            - vertical block   -> take bottom boundary as reference
        N)r   rb   r1   parse_horizontal_spacing)r   r   rY   r    s       r   r   z&Blocks._parse_block_horizontal_spacing  s;     {{''__ 	8E*E**47$7	8r   c           	      >   | j                   j                  }| j                  rdnd}| j                  d   }||   }| j                  D ]  }|j                  r|d   d   j
                  d   dz  }nd}|j                  |   |z
  }||z
  }t        |j                  |dz      ||dz      z
  d      }	||	z  }t        |d      }|j                  r||_	        no|j                  r||_	        n[|j                  s|j                  r||_        n;t        |t        || j                  d   k7        t        j                  z        |_	        |}|j                  |dz      |z   } | j                  d   }|j                  r'|d   xj                  t        j                  z  c_        yy)a<  Calculate external vertical space for text blocks, i.e. before/after space in docx.
        
        The vertical spacing is determined by the vertical distance to previous block.
        For the first block, the reference position is top margin.

        It's easy to set before-space or after-space for a paragraph with ``python-docx``,
        so, if current block is a paragraph, set before-space for it; if current block is 
        not a paragraph, e.g. a table, set after-space for previous block (generally, 
        previous block should be a paragraph).
        r|   r   g       @r  r   rH   N)r   rb   r\   r1   r;   border_widthrY   r   r   r   r?   after_spaceintr   
MINOR_DISTr   )
r   rY   idx	ref_blockr   r    dw	start_pos
para_spacer   s
             r   r   z$Blocks._parse_block_vertical_spacing&  s    {{''
 **aOOA&	s)__ (	1E ##1Xa[--a036 

3",I"7*J UZZA&tCE{2C8B"JZ-J ""%/" ,,%/" ((I,K,K(2	% &)SPQ@R9R5ST]ThTh5h%i" InnSU+b0GQ(	1\ #r!1!1Y5I5I!I!1r   c                     d }| j                   D ]?  }|j                  s ||      rd|_        |j                          0|j	                          A y)zCalculate internal vertical space for text blocks, i.e. paragraph line spacing in docx.

        .. note::
            Run parsing block vertical spacing in advance.
        c                 x    | j                   D ]+  }t        d |j                  D              }t        |      s+ y y)zcheck line spacing type based on parsed font metrics of text span:
            exact line spacing if no standard line height is extracted
            c              3   V   K   | ]!  }t        |t              r|j                    # y wr%   )r*   r   is_valid_line_height).0r  s     r   	<genexpr>zLBlocks._parse_line_spacing.<locals>.is_exact_line_spacing.<locals>.<genexpr>{  s-      +^,0*TS[B\ 04/H/H+H +^s   ')TF)ra   r/   r   any)r    r^   absent_line_heightss      r   is_exact_line_spacingz9Blocks._parse_line_spacing.<locals>.is_exact_line_spacingv  sF      9&* +^48JJ+^ '^#*+D9 r   r   N)r1   r   line_space_typeparse_exact_line_spacingparse_relative_line_spacing)r   r)  r    s      r   r   zBlocks._parse_line_spacingp  sO    	 __ 	4E&&$U+()%..0113	4r   )NN)$__name__
__module____qualname____doc__r/   r   r
   r#   propertyr'   r2   r7   r<   r@   rD   rT   floatrg   rr   r   r   boolr   r   r   r   rd   re   staticmethodrj   r   r   r   r   r   __classcell__)r   s   @r   r   r      s   ) )  + + g g f f _ _ f f d d 4  F&> &>u &>Rd 2Id ITY IX TY z  Ad A
#9Fx7% "e ( !d !T !RV ! !$NE Nb t QV w|  :8 GJT4r   r   ) r0  r   docx.sharedr   commonr   common.Collectionr   common.sharer   r   r	   common.Blockr
   common.docxr   r   text.TextBlockr   text.TextSpanr   	text.Liner   
text.Linesr   
table.Cellr   image.ImageBlockr   table.TableBlockr   r   r"   r   r   <module>rC     sH       1 > >   D & $    ) )p	4 p	4r   