ó
ŠiõVc           @   s¡  d  d l  m Z d  d l m Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z m Z d  d l Z d  d l m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Td  d l Z d  d l Z d  d l	 Z	 d  d l Z d  d l  m! Z! d  d l" Z" d	 „  Z# d
 „  Z$ e% g  d „ Z& g  d „ Z' e( d k re$ ƒ  n  d S(   iÿÿÿÿ(   t   Counter(   t   KMeansN(   t   WordNetLemmatizer(   t   TextBlobt   Word(   t   TfidfTransformert   CountVectorizer(   t   *(   t   BeautifulSoupc         C   s‡   t  j |  ƒ } yi t  j | d d ƒ} | j d k rs | j ƒ  } t | d ƒ } t j | ƒ } | j d ƒ } | Sd SWn d SXd  S(   Nt   timeouti   iÈ   t   html5libs   utf-8t   problem(	   t   urllib2t   Requestt   urlopent   codet   readR   t   webarticle2textt   extractFromHTMLt   encode(   t   urlt   requestt   html_objt   html_doct   soupt   result(    (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pyt   read_url   s    c       
   C   sP  d }  t  j d |  d t  j ƒ t  j ƒ  } | j t  j ƒ t  j d d ƒ } | j d j | ƒ t j	 d d ƒ } | j
 d	 d
 d d d d d d t ƒt j ƒ  } | j ƒ  j } | j d ƒ d d } t | d ƒ j ƒ  } d j | ƒ g } | j d d ƒ | j d | ƒ | j d d ƒ t | | ƒ | j d t j ƒ  | ƒ d  S(   Ns   log_vec2topic.logt   filenamet   levels   %(asctime)s %(message)ss   %b-%d-%Y %H:%M:%Si    t   descriptions   Run Vec2Topic on a text files   -it   actiont   storet   destt   inpt   helps   text file inputedt   requiredt   .s   .csvt   rbt    R   i2   s   Running Vec2Topic on %ss   Total time: %.2f seconds(   t   loggingt   basicConfigt   INFOt	   getLoggert   setLevelt	   Formattert   handlerst   setFormattert   argparset   ArgumentParsert   add_argumentt   Truet   timet
   parse_argsR!   t   splitt   opent	   readlinest   joint   infot	   vec2topic(   t   LOG_FILENAMEt   loggert	   formattert   parsert   time1t   filert   outfilet   text(    (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pyt   main+   s$    %c         C   s×  t  j d ƒ } y t |  d d ƒ}  Wn | j d ƒ n X| j d |  ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d	 d
 | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t j | ƒ } t } | j d | ƒ | r‚t d d ƒ  } | j	 ƒ  } Wd  QX| j
 ƒ  t g  | D] }	 t  j d d |	 ƒ ^ qƒ } g  | D]L }	 t |	 ƒ d k r-d j g  |	 j ƒ  D] }
 |
 | k rX|
 ^ qXƒ ^ q-} n: g  | D]- }	 t |	 ƒ d k r‰d j |	 j ƒ  ƒ ^ q‰} | j d t | ƒ ƒ | S(   Ns   [\w\.-]+@[\w\.-]+t   errorst   ignores   unicode errort    s   [^ -]+R&   s   \rs   \ns   . s   \ts   (?:\@|https?\://)\S+s   's   \d*.\d+s   [.]{2,}R$   s   \d+.\d+s   [~/<>()_=-]s   "s   [,:\*!#%/$+\^]s   [`\[\]\{\}\|]s   \\s   \b\d+\bs    +s   Stop words: %ss   stopwords.txtR%   s   [^\w]i   s   Sentences: %d(   t   ret   compilet   unicodeR9   t   subt   nltkt   sent_tokenizet   FalseR6   R7   t   closet   sett   lenR8   R5   (   t   inputStringR<   t   STOPt	   namesListt   email_patternt   outputStringt   outputSentencest   ft   stopt   wt   x(    (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pyt   cleanupContentD   sD    
.\:c   b         s‘  d } g  |  D] } t  j d d | ƒ ^ q }  t ƒ  } g  |  D]. } g  | j ƒ  D] } | j | ƒ ^ qQ ^ q> } g  | D]% }	 g  |	 D] } | j ƒ  ^ q† ^ qy } g  | D] }	 |	 D] } | ^ qµ q« }
 t |
 ƒ } t |
 ƒ } | j d | ƒ | d k  rd St	 d t
 d t j | j ƒ  d	 d
 ƒƒ ƒ } t	 d | d ƒ } | j d | ƒ | j d ƒ t j d ƒ } t j t d d ƒ ƒ \ } } | j d ƒ t j j j | d | d | ƒ} t | | ƒ } t t g  | D]% } | D] } d | k rã| ^ qãqÙƒ ƒ } g  | D] } t  j d d | ƒ ^ q} g  | D] }	 |	 D] } | ^ qCq9} t | ƒ } g  t t | ƒ ƒ D]$ } | | | d k rw| | ^ qw} t | ƒ j | ƒ } g  } xê | D]â } g  } xÆ | D]¾ } d | k rø| j | ƒ qÖ| | k r#| j t  j d d | ƒ ƒ qÖ| | k r?| j | ƒ qÖt  j d d | ƒ } | j d ƒ d } | j d ƒ d } | j | ƒ | j | ƒ qÖW| j | ƒ qÃW| }  | j d ƒ g  }! xl |  D]d } g  }" t d j | ƒ ƒ }# x3 |# j D]( \ }$ }% |% dG k rô|" j |$ ƒ qôqôW|! j |" ƒ qÉW| j d ƒ d }& d }' t j j j  |  d d d  |' d! d
 d d d | ƒ}( | j d" ƒ | j d# ƒ g  |! D] }) |) D] } | ^ qªq } t | ƒ }* t | ƒ j |( j! ƒ j |* ƒ }+ i  }, g  }- | | k  r| j d$ ƒ t" }. n | j d% ƒ t# }. x} |+ D]u } t | ƒ d k r4|. r…t j$ t j% | t& | ƒ |( | f ƒ ƒ |, | <n | t& | ƒ |, | <|- j | ƒ q4q4W|  }/ g  |! D] }) |) D] } | ^ qÄqº} t | ƒ }* | j d& ƒ t' |, |- d |. d |! d' t" d( t# d d d) d ƒ\ }0 ‰
 t( j) |0 d* d+ d, d- ƒ}1 t |0 ƒ }2 t* |1 ˆ
 |2 ƒ }3 | j d. ƒ g  |! D] } d j | ƒ ^ qz}4 t+ d/ d0 ƒ }5 |5 j, |4 ƒ }6 i  ‰ x' |5 j- j. ƒ  D] \ }7 }8 |7 ˆ |8 <qÉWg  }9 x9 |5 j- j/ ƒ  D]( \ }7 }8 |7 |- k rù|9 j |8 ƒ qùqùWt0 |9 ƒ ‰	 |6 d  d  … ˆ	 f }: t1 ƒ  j, |: ƒ }; |: j2 |: }< | j d1 ƒ |< j3 \ }= }> |< j4 ƒ  }? g  t t ˆ	 ƒ ƒ D] } ˆ ˆ	 | ^ q¡}@ g  }A g  }B g  }C x† t5 j6 |? j7 |? j8 |? j9 ƒ D]f \ }D }E }F |F d k rì|D |E k rìd2 }8 |8 d k rR|B j |D ƒ |C j |E ƒ |A j |8 ƒ qRqìqìWt: j; j< |A |B |C f f d3 |= |> f ƒ}G |G j= ƒ  }H ‡ ‡	 f d4 †  t t ˆ	 ƒ ƒ Dƒ }I d5 „  |I j. ƒ  Dƒ }J | j d6 ƒ |H j> d7 d2 ƒ }K i  ‰ x6 t t ˆ	 ƒ ƒ D]" }L t? |K |L ƒ ˆ ˆ ˆ	 |L <qÿWt
 ˆ j ƒ  ƒ ‰ t
 |3 j ƒ  ƒ ‰ ‡ ‡ f d8 †  ˆ j@ ƒ  Dƒ }M t jA d9 ƒ t jA t jB |M j ƒ  ƒ ƒ ‰  ‡  f d: †  |M j/ ƒ  Dƒ ‰ ‡ f d; †  |3 j/ ƒ  Dƒ } t jA d9 ƒ t jA t jB | j ƒ  ƒ ƒ ‰  ‡  f d< †  | j/ ƒ  Dƒ ‰ ‡ ‡ f d= †  ˆ j@ ƒ  Dƒ } t j
 | j ƒ  ƒ ‰ ‡ f d> †  | j/ ƒ  Dƒ }N | j d? ƒ d }O tC d@ |O ƒ ‰ ˆ jD g  |0 D] } | ^ q‘
ƒ ‡ ‡
 f dA †  t t ˆ
 ƒ ƒ Dƒ }P i  }Q g  t |O ƒ D] }D g  ^ qÜ
}R g  t |O ƒ D] }D g  ^ qû
}S x– t |O ƒ D]ˆ }D g  t t ˆ
 ƒ ƒ D]# }L ˆ jE |L |D k r3ˆ
 |L ^ q3|R |D <g  |R |D D] } |N | ^ qk}T t jF t0 |T dB t# ƒƒ |S |D <qWt jG |S ƒ }U xC tH t |O ƒ ƒ D]/ \ }V }D x  |R |U |D D] } |D |Q | <qãWqÈW| j dC ƒ g  }W |O }X dD }Y x” t |X ƒ D]† }D |R |U |D }Z g  t0 g  |Z D] } | |N | g ^ qHdE tI d2 ƒ dB t# ƒD] } | d ^ qw}[ |W j dF |D d2 g |[ |Y  ƒ q'Wt
 g  |W D] } t | ƒ ^ q»ƒ }\ g  }] x0 |W D]( }^ |] j |^ d g |\ t |^ ƒ ƒ qãWt t5 jJ |] Œ  ƒ } g  t0 |N j. ƒ  dE tI d2 ƒ dB t# ƒD] } | d ^ qI|Y  }_ tK |H ˆ	 ˆ |N |Q ƒ \ }` }a | |_ | |` f S(H   Niˆ  s   [^\w]R&   s   Num of words: %did   R   i   i   t   qi2   i
   s   Min count= %ds   Reading wiki vecss   /data/wikimodel/wiki.shelves%   /data/wikimodel/wiki_bigram_nouns.pklR%   s   Running Bigramst	   min_countt	   thresholdt   _RF   i    s   Extracting Nounst   NNt   NNPt   NNSt   NNPSs   Word2Vec training starting...i,  i   t   workerst   sizet   iters   Word2Vec training complete...s   Creating word vecss   Using Wiki Vecs Onlys   Using Local and Wiki Vecss   Clustering for depth...t   repeatt
   normalizedt   l2_thresholdt   methodt   averaget   metrict   cosines   Computing co-occurence grapht   token_patternu   (?u)\b([^\s]+)s   Computing degreei   t   shapec            s!   i  |  ] } ˆ  ˆ | | “ q S(    (    (   t   .0RY   (   t   id2wordt   sort_ids(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>  s   	 c         S   s   i  |  ] \ } } | | “ q S(    (    (   Rp   t   keyt   value(    (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>   s   	 s   Computing metricst   axisc            s;   i  |  ]1 } t  j d  ˆ  | ƒ t  j d  ˆ ƒ | “ q S(   i   (   t   npt   log(   Rp   RY   (   t   degt   max_deg(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>,  s   	 g      à?c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   Rp   Rs   Rt   (   t   alpha(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>.  s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   Rp   Rs   Rt   (   t	   max_depth(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>0  s   	 c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   Rp   Rs   Rt   (   Rz   (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>2  s   	 c            s%   i  |  ] } ˆ  | ˆ | | “ q S(    (    (   Rp   Rs   (   t   deg_modt	   depth_mod(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>4  s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   Rp   Rs   Rt   (   t
   max_metric(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>6  s   	 s   Running K-meanst
   n_clustersc            s$   i  |  ] } ˆ  j  | ˆ | “ q S(    (   t   labels_(   Rp   RZ   (   t   kmeanst   word_d2v(    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pys
   <dictcomp>>  s   	 t   reverses   Done...Writing topicsi   Rs   s   Topic %d(   R`   Ra   Rb   Rc   (L   RG   RJ   R   R5   t	   lemmatizet   lowerR    RP   R9   t   mint   maxRv   t
   percentilet   valuest   shelveR6   t   cPicklet   loadt   gensimt   modelst   phrasest   Phrasest   listRO   t   xranget   intersectiont   appendt	   partitionR   R8   t   tagst   word2vect   Word2Vect   vocabRM   R2   t   arrayt   concatenatet   strt   create_word_listt   fastclustert   linkaget   calculate_depthR   t   fit_transformt   vocabulary_t   itemst	   iteritemst   sortedR   t   TRo   t   tocoot	   itertoolst   izipt   rowt   colt   datat   spt   sparset
   coo_matrixt   tocsct   sumt   intt   iterkeysRw   t   medianR   t   fitR€   t   meant   argsortt	   enumeratet
   itemgettert   izip_longestt   grapher(b   t
   inpContentR<   RA   t   local_vec_thresholdt   tempt   wordnet_lemmatizert   XRY   t   sentences_lemR_   t	   flat_sentt	   word_freqt	   num_wordsR]   R^   t   model_wiki_vect   model_wiki_vocab_lowercaset   wiki_bigram_wordt   bigramt   sentences_bigramst   bigram_listt   bigram_joinedt   wordst   bigram_freqt   to_joint   wiki_bigram_word_commont   sentences_bigrammed_tempt   sentt   new_sentt   new_wt   new_w1t   new_w2t   sentences_bigrammedt   sentences_nounst   nounst   blobt   wordt   tagt   dim_wikit   dim_datat   model_wRB   t   Vocabt   common_vocabt
   model_combt   model_comb_vocabt	   local_vect	   sentencest   data_d2vt	   spclustert
   num_pointst   depthR¦   t   cvt
   bow_matrixRs   Rt   t   idst   bow_reducedRh   t   similarity_graph_reducedt   mt   nt   cxt   keyzR¬   t   rot   cot   it   jt   vt   SSt   SP_fullt   id_wordt   word_idt   degsumRZ   t   temp_deg_modRl   t   Kt   kmeans_labelt   kmeans_label_rankedt   topict   clust_deptht
   temp_scoret   indext   numt   listert   to_showt   to_show_wordst   topt   sort_topt   max_lent   new_listt   list_elt   sorted_wordst	   json_dataR   (    (   Rz   Rx   R|   R}   Rq   R   Ry   R{   R~   Rr   R‚   s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pyR:   o   sF   (	;2&-!>(&=5&!	3&;"-.*% ++ %@!%R$%&?t   __main__()   t   collectionsR    t   sklearn.clusterR   t   numpyRv   t   scipyR­   R¨   t   jsonR/   R   RG   RK   t	   nltk.stemR   t   textblobR   R   R‹   t   sklearn.feature_extraction.textR   R   R'   R3   t   sklearnRž   t   networkxt   utilsRŠ   t   csvR   t   bs4R   R   R   RC   RM   R[   R:   t   __name__(    (    (    s8   /var/www/html/vec2topic_graph/vec2topic/vec2topic_app.pyt   <module>   s6   
		+ô