
 Vc           @   s  d  Z  df Z d j e e e   Z d d l Z d d l Z d d l Z d d l	 Z	 d d l
 Z
 d d l Z d d l Z d d l Z d d l Z d d l m Z d d l m Z d d	 l m Z y d d
 l m Z Wn! e k
 r d d l m Z n Xd d l m Z d d l m Z d d l m Z m  Z  m! Z! d d l m" Z" e j# Z# e j$ Z% e j& Z& d   Z' d   Z( dg Z) d e f d     YZ* d  e f d!     YZ+ d" d#  Z, d$   Z- d% d&  Z. d d'  Z0 d(   Z1 d)   Z2 d" d d d*  Z3 d d+  Z4 e5 d, e5 d d d d" d" e5 d e5 d-  Z6 d.   Z7 d/   Z8 d0   Z9 e: d1 k rd d2 l; m< Z< d3 Z= e< d4 e=  Z> e> j? d5 d6 d7 d8 d9 d d: d; e> j? d< d= d7 d> d? d@ d9 e5 d: dA e> j? dB dC d7 dD d9 d, d: dE e> j? dF dG d7 dH d9 d d: dI e> j? dJ dK d7 dL d9 d dM e8   d: dN dO j e8    e> j? dP dQ d7 dR d? d@ d9 e5 d: dS e> j? dT dU d7 dV d9 e5 d? d@ d: dW e> j? dX dY d7 dZ d9 d d: d[ e> j? d\ d] d7 d^ d9 d" d: d_ e> j@   \ ZA ZB eC eB  d` k  re> jD   e jE   n  eB d ZF e6 da eF eA jG  ZH eH jI db  ZH y# e jJ jK eH jL db dc dd  Wn! eM k
 re jJ jK eH  n Xe jJ jK de  n  d S(h   s,  
File: webarticle2text.py

Copyright (C) 2008 Chris Spencer (chrisspen at gmail dot com)

Attempts to locate and extract the largest cluster of text in a
webpage. It does this by walking the DOM-tree, identifying all text
segments and their depth inside the DOM, appends all text at roughly
the same depth, and then returns the chunk with the largest total
length.

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
i   i    i   t   .iN(   t   html_entities(   t
   HTMLParser(   t   http_client(   t	   cStringIO(   t   StringIO(   t   parse(   t	   HTTPError(   t   OpenerDirectort   Requestt   urlopen(   t   robotparserc         C   s,   y t  |  d  }  Wn t k
 r' |  SXd  S(   Ns   utf-8(   t   unicodet	   TypeError(   t   text(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   get_unicodeC   s    c         C   s   d   } t  j d | |   S(   s   Removes HTML or XML character references 
      and entities from a text string.
      keep &amp;, &gt;, &lt; in the source code.
   from Fredrik Lundh
   http://effbot.org/zone/re-sub.htm#unescape-html
   c         S   s   |  j  d  } | d  d k r{ yE | d  d k rL t t | d d !d   St t | d d !  SWq t k
 rw q Xn2 y t t j | d d ! } Wn t k
 r n X| S(	   Ni    i   s   &#i   s   &#xii   i   (   t   groupt   unichrt   intt
   ValueErrort   htmlentitydefst   name2codepointt   KeyError(   t   mR   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   fixupP   s    s   &#?\w+;(   t   ret   sub(   R   R   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   unescapeHTMLEntitiesI   s    	t   scriptt   stylet   optiont   ult   lit   legendt   objectt   noscriptt   labelt   footert   navt   asidet   TextExtractorc           B   sz   e  Z d  Z g  Z d g Z d   Z d   Z d   Z d   Z e	 d  Z
 d   Z d   Z d	   Z d
   Z d   Z RS(   s,  
    Attempts to extract the main body of text from an HTML document.
    
    This is a messy task, and certain assumptions about the story text
    must be made:
    
    The story text:
    1. Is the largest block of text in the document.
    2. Sections all exist at the same relative depth.
    i    c         C   sY   t  j |   t |  _ d  |  _ d  |  _ d |  _ i  |  _ d |  _	 d |  _
 d |  _ d  S(   Ni    i   (   R   t   __init__t   Falset   _ignoret   Nonet   _ignorePatht   _lasttagt   _deptht	   depthTextt   countingt   lastNt   pathBlur(   t   self(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyR)   y   s    							c         C   sk  |  j  } | j   } | t k r- t |  _  n  t |  } | j   |  _ |  j d 7_ |  j |  j g 7_ d |  _ d | k r d | d j   k r t |  _  n d | k r d | d j   k r t |  _  nq d | k rd | d j   k rt	 |  j
 d  |  _
 t |  _  n. d | k rBd | d j   k rBt |  _  n  |  j  rg| rgt |  j  |  _ n  d  S(   Ni   i    t   idR%   t	   copyrightt   class(   R+   t   lowert   IGNORED_TAGSt   Truet   dictR.   R/   t   pathR2   t   maxR1   t   tupleR-   (   R4   t   tagt   attrst   ignore0t   attrd(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_starttag   s(    		""""c         C   s   d  S(   N(    (   R4   R?   R@   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_startendtag   s    c         C   s|   |  j  r- t |  j  |  j k r- t |  _  n  |  j d 8_ t |  j  r` |  j j   |  _ n	 d |  _ |  j d 7_ d  S(   Ni   i    (	   R+   R>   R<   R-   R*   R/   t   lent   popR2   (   R4   R?   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_endtag   s    !	c         C   s   t  |  d k r |  j r | j   j   } | j d  rf |  j rf t |  _ t |  j  |  _ d  S| r t |  j |  j	   } |  j
 j | g   |  j
 | c | g 7<t |  j |  j	 d   } |  j
 j | g   |  j
 | c d | g 7<q n  d  S(   Ni    R6   i   t   #(   RE   R+   t   stripR8   t
   startswithR:   R>   R<   R-   R3   R0   t
   setdefault(   R4   t   datat   entityt   _datat   rpatht   rpath2(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_data   s    	c         C   sZ   | j    r) t t d | d   } n t t d | d   } |  j | d t d  S(   Ns   &#t   ;t   &RM   (   t   isdigitR   R   RQ   R:   (   R4   t   nameR   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_charref   s    c         C   s   |  j  |  d  S(   N(   RV   (   R4   RU   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_entityref   s    c   
      C   s  d d	 d g  f \ } } } } xt  j |  j  D]\ } } t } g  } xQ | D]I }	 t |	 j    r |	 j d  r | r qP n  t } n  | j |	  qP Wt } t	 |  } g  } xQ | D]I }	 t |	 j    r |	 j d  r | r q n  t } n  | j |	  q Wt	 |  } t
 d  j |  j d d  } | j t
 d  d  } | j t
 d  d  } t j d t
 d  |  j   } t | | | | f t |  | | | f  \ } } } } q1 W| S(
   Ni    t    RH   s   t    s   \u2019t   's   [\n\s]+(    (   t   sixt	   iteritemsR0   R:   RE   RI   RJ   R*   t   appendt   reversedt   ut   joint   replaceR   R   R=   (
   R4   t   maxLent   maxPatht   maxTextt   maxTextListR<   t   textListt   startR   t   t(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   get_plaintext   s8    		!!+c         C   s-   y t  j |  |  SWn t k
 r( d SXd  S(   Ni(   R   t   parse_endtagt   AttributeError(   R4   t   i(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyRj      s    c         C   s   d  S(   N(    (   R4   t   msg(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   error  s    (   t   __name__t
   __module__t   __doc__t   domR<   R)   RC   RD   RG   R*   RQ   RV   RW   Ri   Rj   Rn   (    (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyR(   j   s   
								(	t   HTMLParserNoFootNotec           B   sJ   e  Z d  Z d Z d g Z d   Z d   Z d   Z d   Z	 d   Z
 RS(   sG   
    Ignores link footnotes, image tags, and other useless things.
    i    c         G   s2   t  j d  |  j d g 7_ | d k r. n  d  S(   Ng      ?i    R   (   t   timet   sleepR<   (   R4   R?   R@   t   args(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyRC     s    c         G   s3   |  j  j   |  j  d c d 7<| d k r/ n  d  S(   Nii   R   (   R<   RF   (   R4   R?   Rv   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyRG     s    c         C   s   |  j  r d  |  _  n  d  S(   N(   t   anchorR,   (   R4   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt
   anchor_end  s    	c         G   s   d  S(   N(    (   R4   t   srct   altRv   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   handle_image  s    c         C   s;   |  j  r' d j |  j  j |   } n  t j |  |  d  S(   NRY   (   t   textPatternR`   t   findallR   RQ   (   R4   RL   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyRQ   "  s    	N(   Ro   Rp   Rq   R,   R|   R<   RC   RG   Rx   R{   RQ   (    (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyRs     s   					i   c         C   s   t  |   }  t |  t   s! t  t   } t j t j |   } t   } | | _ | j	 |   | j
   | j   } t j d d |  j   } t j d d |  j   } t j d d |  j   } t j d d |  j   } | S(   s*   
    Extracts text from HTML content.
    s   \s[\(\),;\.\?\!](?=\s)RY   s   [
\s]+s   \-{2,}RX   s   \.{2,}(   R   t
   isinstancet   AssertionErrorR   t	   formattert   AbstractFormattert
   DumbWriterR(   R3   t   feedt   closeRi   R   R   RI   (   t   htmlt   blurt   filet   ft   pR   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   extractFromHTML(  s    			
c         C   s|   y d d l  m } Wn' t k
 r= } t d d |   n Xi d d 6d d 6d d 6d	 d
 6} | |  d | \ } } | S(   s5   
    Runs an arbitrary HTML string through Tidy.
    i(   t   tidy_documents"   %s
You need to install pytidylib.
s   e.g. sudo pip install pytidylibi   s   output-xhtmlt   indents	   tidy-markt   raws   char-encodingt   options(   t   tidylibR   t   ImportError(   t	   dirtyHTMLR   t   eR   R   t   errors(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   tidyHTMLH  s    
s   %s.txtc         C   s0   t  j   } | j |  j d   | | j   S(   ss   
    Generates the cache key for the given string using the content in pattern
    to format the output string
    s   utf-8(   t   hashlibt   sha1t   updatet   encodet	   hexdigest(   t   st   patternt   h(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   generate_key\  s    c         C   sP   t  j j |  |  } t  j j |  rL t | d   } | j   SWd QXn  | S(   sB   
    Returns the content of a cache item or the given default
    t   rN(   t   osR<   R`   t   isfilet   opent   read(   t	   cache_dirt	   cache_keyt   defaultt   filenameR   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt	   cache_getf  s
    c         C   s>   t  j j |  |  } t | d   } | j |  Wd QXd S(   s9   
    Creates a new cache file in the cache directory
    t   wN(   R   R<   R`   R   t   write(   R   R   t   contentR   R   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt	   cache_setr  s    c         C   s;   t  j j |  |  } t  j j |  r7 t  j j |  Sd S(   sB   
    Returns the cache files mtime or 0 if it does not exists
    i    (   R   R<   R`   t   existst   getmtime(   R   R   R   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt
   cache_info{  s    c   
      C   s   i  } | r t  |  | d <n  t j |  d | d | } | r t | t t f  s[ t  t j |   \ } } | s | j	   } | j
 d  p d j d  d } n  | | k r d Sn  y | j SWn t j k
 r }	 |	 j SXd S(	   s/   
    Retrieves the raw content of the URL.
    s
   User-agentt   headerst   timeouts   Content-TypeRX   RR   i    N(   t   strt   requestst   getR~   R>   t   listR   t	   mimetypest
   guess_typet   infot	   getheadert   splitR   t   httplibt   IncompleteReadt   partial(
   t   urlR   t	   userAgentt   only_mime_typesR   t   responset   ctt   mt_encodingt   response_infoR   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   fetch  s     %c         C   s  t  j |   \ } } } } } t  j | | d d d f  }	 t |	  }
 t j   } | ri t | |
  n d } t j   d } | s t | |
  | k  r%y/ t	 |	 d | } | r t
 | |
 |  n  Wq%t k
 r!} | j d k r t | _ n! | j d k rt | _ n |  d } q%Xn  y t | d	 d
 } Wn t k
 rNn X| j d   | j d  D  d  } x0 t   j D]" \ } } | d k r| } PqqW| j | p| |   S(   Ns   /robots.txtRX   iQ i   R   i  i  i  t   encodingt   utf8c         s   s   |  ] } | Vq d  S(   N(    (   t   .0t   x(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pys	   <genexpr>  s    s   
s
   User-agenti:	 (   i  i  (   t   urlparset   urlsplitt
   urlunsplitR   R   t   RobotFileParserR   Rt   R   R   R   R   t   codeR:   t   disallow_allt	   allow_allR   R   R   R   R,   R   t
   addheaderst	   can_fetch(   R   t   useCacheR   R   t   schemet   netloct   url_patht   queryt   fragmentt   robotstxt_urlt   keyt   robots_parsert   cached_contentt	   thresholdt   het   default_useragentt   kt   v(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   check_robotstxt  sD     t   _cachec         C   s  t  |  } y d d l } Wn' t k
 rE } t d d |   n X|
 rm t |
 t  rm |
 j d  }
 n  | r t j j |  s d } t j	 | |  n  t
 |   } t | |  } | r | Sn  |	 s t |  | | d | s | r d GHn  d	 Sn  | rd
 |  GHn  t |  d | d | d |
 } | s3d	 S| st | t  r]| j d d  } n  | j |  } | d } | rd | GHqn  | rd t |  GHn  | rt
 |  d  } t | | |  n  | rt t j | j d   } x) | D] } t |  } | |  } qWn  t |  } | r<d t |  GHn  | sFd	 St | d | d d } | rh| St | d | } t | t  st  | j | d  } | rt | | |  n  | S(   s  
    Extracts text from a URL.

    Parameters:
    url := string
        Remote URL or local filename where HTML will be read.
    cache := bool
        True=store and retrieve url from cache
        False=always retrieve url from the web
    cacheDir := str
        Directory where cached url contents will be stored.
    verbose := bool
        True=print logging messages
        False=print no output
    encoding := string
        The encoding of the page contents.
        If none given, it will attempt to guess the encoding.
        See http://docs.python.org/howto/unicode.html for further info
        on Python Unicode and encoding support.
    filters := string
        Comma-delimited list of filters to apply before parsing.
    only_mime_types := list of strings
        A list of mime-types to limit parsing to.
        If the mime-type of the raw-content retrieved does not match
        one of these, a value of None will be returned.
    iNs    %s
You need to install chardet.
s   e.g. sudo pip install chardett   ,i  R   s   Request denied by robots.txtRX   s   Reading %s...R   R   R   Ra   R   s   Using encoding %s.s   Read %i characters.s   %s.raws   Extracted %i characters.R   R   t   ignore(   R   t   chardetR   R~   t
   basestringR   R   R<   t   isdirt   makedirsR   R   R   R   R   R   t   detectRE   R   t   mapR   RI   t
   get_filterR   R   R   (   R   t   cachet   cacheDirt   verboseR   t   filtersR   R   R   t   ignore_robotstxtR   R   R   R   t   cache_permsR   R   R   t   encoding_opiniont   raw_keyt   filter_namest   filter_namet   filtert   res(    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   extractFromURL  sz    '  	
   c         C   s   t  j d d |   S(   Ns   &#[a-zA-Z]+RX   (   R   R   (   R   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   filter_remove_entitiesJ  s    c          C   sD   g  t  j t    D]- \ }  } |  j d  r |  j d d  ^ q S(   Nt   filter_RX   (   R[   R\   t   globalsRJ   Ra   (   R   R   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   get_filter_namesM  s    c         C   s   t  d t j d d |    S(   NR   s
   [^a-zA-Z_]RX   (   t   evalR   R   (   RU   (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyR   T  s    t   __main__(   t   OptionParsers5   usage: %prog [options] <remote url or local filename>t   usages   -es
   --encodingt   destR   R   t   helpsA   Manually specifies the encoding to use when interpreting the url.s   -cs   --cacheR   t   actiont
   store_trues!   Stores and loads data from cache.s   -ds
   --cacheDirR   s/   The directory where cache files will be stored.s   -us   --userAgentR   s+   The user-agent to use when requesting URLs.s   -fs	   --filtersR   t   choicessG   A comma-delimited list of pre-processing filters to apply, one of [%s].t   |s   -vs	   --verboseR   s   Displays status messages.s   -is   --ignore-robotstxtR   s,   Ignore robots.txt when fetching the content.s   -ms   --only-mime-typesR   s;   A comma-delimited list of mime-types to limit retrieval to.s   -bs   --blurR   s   The number of DOM levels to include together when searching for the largest single chunk of text. A bigger number will find more text, but that text will morel likely be junk. A smaller number will find less text, but that text is less likely to be junk.i   R   s   utf-8R   R   s   
(   i   i    i   (   s   scripts   styles   optionR   s   lis   legends   objectR#   s   labels   footerR&   R'   (N   Rq   t   VERSIONR`   R   R   t   __version__R   t   sysRt   R   R   R   R   R   R[   t	   six.movesR   R   t   six.moves.html_parserR   R   R   R   R   R   t   six.moves.urllibR   R   t   six.moves.urllib.errorR   t   six.moves.urllib.requestR   R	   R
   R   R_   t	   text_typeR   R   R   R   R9   R(   Rs   R   R   R   R,   R   R   R   R   R   R*   R   R   R   R   Ro   t   optparseR   R   t   parsert
   add_optiont
   parse_argsR   Rv   RE   t
   print_helpt   exitR   t   __dict__R   t   decodet   stdoutR   R   R   (    (    (    s/   /var/www/vec2topic/vec2topic/webarticle2text.pyt   <module>   s   					 " 	
			$-m				

#