
    yj+                         d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ  ej        e          Z G d de          Z G d d          Z G d d          Z  G d d          Z! G d d          Z"d&de#de$de%e#         fdZ&	 d'dede#de#de#dz  de%e         f
dZ'ej(        dddfd ed!e#de$d"e)e#ef         dz  d#ej*        dz  d$e j         dz  de%e)e#ef                  fd%Z+dS )(    N)BytesIO)AnyProtocol)
UploadFile)generate)Integerselect)AsyncSession)schemas)settings)FileProcessingErrorUnsupportedFileTypeErrorValidationExceptionMessagec                   .    e Zd ZdedefdZdedefdZdS )FileProcessorcontentreturnc                 
   K   d S N )selfr   s     7/DATA/AppData/hermes/projects/honcho/src/utils/files.pyextract_textzFileProcessor.extract_text   s
            content_typec                     d S r   r   r   r   s     r   supports_file_typez FileProcessor.supports_file_type   s      r   N)__name__
__module____qualname__bytesstrr   boolr    r   r   r   r   r      s<        <%<C<<<<@s@t@@@@@@r   r   c                   .    e Zd ZdedefdZdedefdZdS )PDFProcessorr   r   c                     |dk    S )Nzapplication/pdfr   r   s     r   r    zPDFProcessor.supports_file_type   s    000r   r   c                 x  K   dd l }|                    t          |                    5 }g }t          |j                  D ]M\  }}|                                }|r2|                                r|                    d|dz    d|            Nd                    |          cd d d            S # 1 swxY w Y   d S )Nr   z[Page    z]


)	
pdfplumberopenr   	enumeratepagesr   stripappendjoin)r   r   r-   
pdf_reader
text_partspage_numpagetexts           r   r   zPDFProcessor.extract_text    s     __WW--.. 	+*$&J"+J,<"="= H H$((** HDJJLL H%%&Fx!|&F&F&F&FGGG;;z**	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+s   A9B//B36B3Nr!   r"   r#   r%   r&   r    r$   r   r   r   r   r(   r(      sX        1s 1t 1 1 1 1	+% 	+C 	+ 	+ 	+ 	+ 	+ 	+r   r(   c                   .    e Zd ZdedefdZdedefdZdS )TextProcessorr   r   c                 ,    |                     d          S )Nztext/)
startswithr   s     r   r    z TextProcessor.supports_file_type-   s    &&w///r   r   c                 |   K   dD ])}	 |                     |          c S # t          $ r Y &w xY wt          d          )N)utf-8zutf-16zlatin-1zCould not decode text file)decodeUnicodeDecodeError
ValueError)r   r   encodings      r   r   zTextProcessor.extract_text0   sa      6 	 	H~~h/////%   5666s   
,,Nr9   r   r   r   r;   r;   ,   sX        0s 0t 0 0 0 07% 7C 7 7 7 7 7 7r   r;   c                   .    e Zd ZdedefdZdedefdZdS )JSONProcessorr   r   c                     |dk    S )Nzapplication/jsonr   r   s     r   r    z JSONProcessor.supports_file_type;   s    111r   r   c                 L  K   dd l }	 |                    d          }n"# t          $ r}t          d          |d }~ww xY w|                                sdS 	 |                    |          }n"# |j        $ r}t          d          |d }~ww xY w|                    |d          S )Nr   r?   z"JSON uploads must be UTF-8 encoded zUploaded JSON is invalidF)ensure_ascii)jsonr@   rA   r   r1   loadsJSONDecodeErrordumps)r   r   rJ   decoded_contentexcdatas         r   r   zJSONProcessor.extract_text>   s      	U%nnW55OO! 	U 	U 	U%&JKKQTT	U $$&& 	2	K::o..DD# 	K 	K 	K%&@AAsJ	K zz$Uz333s(    
=8=A- -
B7BBNr9   r   r   r   rE   rE   :   sX        2s 2t 2 2 2 24% 4C 4 4 4 4 4 4r   rE   c                   :    e Zd Zd ZdedefdZdededz  fdZdS )FileProcessingServicec                 b    t                      t                      t                      g| _        d S r   )r(   r;   rE   
processors)r   s    r   __init__zFileProcessingService.__init__S   s$    NNOOOO0
r   filer   c                 4  K   |                                  d{V }|                    d           d{V  |                     |j        pd          }|s)t	          d|j         dd | j        D                        |                    |           d{V S )z7Extract text from uploaded file without saving to disk.Nr   rH   zUnsupported file type: z. Supported types: c                 &    g | ]}|j         j        S r   )	__class__r!   ).0ps     r   
<listcomp>zBFileProcessingService.extract_text_from_upload.<locals>.<listcomp>e   s     PPPjkQRQ\QePPPr   )readseek_get_processorr   r   rT   r   )r   rV   r   	processors       r   extract_text_from_uploadz.FileProcessingService.extract_text_from_upload[   s      		###### iill''(9(?R@@	 	* B$*;  B  BPPoso~PPP  B  B   ++G444444444r   r   Nc                 L    | j         D ]}|                    |          r|c S d S r   )rT   r    )r   r   r`   s      r   r_   z$FileProcessingService._get_processorj   s?     	! 	!I++L99 !    !tr   )	r!   r"   r#   rU   r   r%   ra   r   r_   r   r   r   rR   rR   R   sl        
 
 
5: 5# 5 5 5 53 =43G      r   rR   \  r8   	max_charsr   c                    t          |           |k    r| gS g }d}|t          |           k     r||z   }|t          |           k    r|                    | |d                    nj|}dD ]3}|                     |||          }||k    r|t          |          z   } n4|                    | ||                    |}|t          |           k     |S )z6Split text into chunks that fit within message limits.r   N)r,   
z.  )lenr2   rfind)r8   rd   chunkscurrent_posend_pos	break_pos	delimiterlast_delimiters           r   split_text_into_chunksrp   q   s    
4yyIvFK
D		
!
!	)c$iiMM${||,--- 	2 	 	I!ZZ	;HHN++*S^^;	 , 	d;y01222# D		
!
!& Mr   dbworkspace_namefile_idsession_namec           	      .  K   ddl m}m} ddlm} t          |                               ||j        |k    |                    |j	        d          |k                        }|r|                    |j
        |k              }|                    |                    |j	        d                              t                              }|                     |           d{V }t          |                                                                          S )zAGet all messages for a specific document, ordered by chunk_index.r   )and_funcr   rs   chunk_indexN)
sqlalchemyrv   rw   
src.modelsr   r	   whererr   jsonb_extract_path_textinternal_metadatart   order_bycastr   executelistscalarsall)	rq   rr   rs   rt   rv   rw   r   queryresults	            r   get_file_messagesr      s0      &%%%%%%%""""""7OO!!"n4(()BINN	
 	
 E  BG0L@AA NN$$W%>NNSS	
 	
 E ::e$$$$$$$$F  $$&&'''r   rV   peer_idmetadataconfiguration
created_atc                   K   t                      }g }|                    |            d{V }t          ||          }	t                      }
t	          |	          D ]\  }}|pd}t          j        |||||          }|
| j        |t          |	          | j	        | j
        ||z  t          |dz   |z  t          |                    gd}|                    ||d           |st                      |S )a  
    Process an uploaded file and prepare message creation data.

    This function extracts text from a file, splits it into chunks, and prepares
    the data needed to create messages.

    Args:
        file: Uploaded file to process
        peer_id: ID of the peer creating the messages
        max_chars: Maximum characters per message chunk
        metadata: Optional metadata to associate with all messages created from this file
        configuration: Optional configuration to associate with all messages created from this file
        created_at: Optional created_at timestamp to use for all messages created from this file

    Returns:
        List of dictionaries containing message_create and file_metadata

    Raises:
        HTTPException: If file processing fails
    N)rd   rH   )r   r   r   r   r   r+   )rs   filenamerx   total_chunksoriginal_file_sizer   chunk_character_range)message_createfile_metadata)rR   ra   rp   generate_nanoidr/   r   MessageCreater   rh   sizer   minr2   r   )rV   r   rd   r   r   r   file_processorall_message_dataextracted_textrj   rs   ichunkmessage_contentr   r   s                   r   !process_file_uploads_for_messagesr      sM     : +,,N-/ *BB4HHHHHHHHN $NiHHHFGf%%  
  
5+2 !.#'!
 
 
 KK"&) -IQUi'^)<)<==&
 
 	"0!. 	
 	
 	
 	
  $!###r   )rc   r   ),datetimeloggingior   typingr   r   fastapir   nanoidr   r   ry   r   r	   sqlalchemy.ext.asyncior
   srcr   
src.configr   src.exceptionsr   r   r   src.schemasr   	getLoggerr!   loggerr   r(   r;   rE   rR   r%   intr   rp   r   MAX_MESSAGE_SIZEdictMessageConfigurationr   r   r   r   <module>r      s                                 . . . . . . & & & & & & & & / / / / / /                     
       		8	$	$A A A A AH A A A
+ + + + + + + + 7 7 7 7 7 7 7 74 4 4 4 4 4 4 40       >   c    D  $	( ((( ( *	(
 
']( ( ( (H .&*9=+/L L
LL L 38nt#	L
 /$6L !D(L 
$sCx.L L L L L Lr   