ó
„IWc           @   s¡  d  d l  m Z d  d l m Z d  d l Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 Z
 d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z m Z d  d l Z d  d l m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l Td  d l Z d  d l Z d  d l	 Z	 d  d l Z d  d l  m! Z! d  d l" Z" d	 „  Z# d
 „  Z$ e% g  d „ Z& g  d „ Z' e( d k re$ ƒ  n  d S(   iÿÿÿÿ(   t   Counter(   t   KMeansN(   t   WordNetLemmatizer(   t   TextBlobt   Word(   t   TfidfTransformert   CountVectorizer(   t   *(   t   BeautifulSoupc         C   s‡   t  j |  ƒ } yi t  j | d d ƒ} | j d k rs | j ƒ  } t | d ƒ } t j | ƒ } | j d ƒ } | Sd SWn d SXd  S(   Nt   timeouti   iÈ   t   html5libs   utf-8t   problem(	   t   urllib2t   Requestt   urlopent   codet   readR   t   webarticle2textt   extractFromHTMLt   encode(   t   urlt   requestt   html_objt   html_doct   soupt   result(    (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pyt   read_url   s    c       
   C   sh  d }  t  j d |  d t  j ƒ t  j ƒ  } | j t  j ƒ t  j d d ƒ } | j d j | ƒ t j	 d d ƒ } | j
 d	 d
 d d d d d d t ƒt j ƒ  } | j ƒ  j } | j d ƒ d d } t | d ƒ j ƒ  } d j | ƒ g } t d j | ƒ | ƒ } | j d d ƒ | j d | ƒ | j d d ƒ t | | ƒ | j d t j ƒ  | ƒ d  S(   Ns   log_vec2topic.logt   filenamet   levels   %(asctime)s %(message)ss   %b-%d-%Y %H:%M:%Si    t   descriptions   Run Vec2Topic on a text files   -it   actiont   storet   destt   inpt   helps   text file inputedt   requiredt   .s   .csvt   rbt    R   i2   s   Running Vec2Topic on %ss   Total time: %.2f seconds(   t   loggingt   basicConfigt   INFOt	   getLoggert   setLevelt	   Formattert   handlerst   setFormattert   argparset   ArgumentParsert   add_argumentt   Truet   timet
   parse_argsR!   t   splitt   opent	   readlinest   joint   cleanupContentt   infot	   vec2topic(   t   LOG_FILENAMEt   loggert	   formattert   parsert   time1t   filert   outfilet   text(    (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pyt   main+   s&    %c         C   s×  t  j d ƒ } y t |  d d ƒ}  Wn | j d ƒ n X| j d |  ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d	 d
 | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t j | ƒ } t } | j d | ƒ | r‚t d d ƒ  } | j	 ƒ  } Wd  QX| j
 ƒ  t g  | D] }	 t  j d d |	 ƒ ^ qƒ } g  | D]L }	 t |	 ƒ d k r-d j g  |	 j ƒ  D] }
 |
 | k rX|
 ^ qXƒ ^ q-} n: g  | D]- }	 t |	 ƒ d k r‰d j |	 j ƒ  ƒ ^ q‰} | j d t | ƒ ƒ | S(   Ns   [\w\.-]+@[\w\.-]+t   errorst   ignores   unicode errort    s   [^ -]+R&   s   \rs   \ns   . s   \ts   (?:\@|https?\://)\S+s   's   \d*.\d+s   [.]{2,}R$   s   \d+.\d+s   [~/<>()_=-]s   "s   [,:\*!#%/$+\^]s   [`\[\]\{\}\|]s   \\s   \b\d+\bs    +s   Stop words: %ss   stopwords.txtR%   s   [^\w]i   s   Sentences: %d(   t   ret   compilet   unicodeR:   t   subt   nltkt   sent_tokenizet   FalseR6   R7   t   closet   sett   lenR8   R5   (   t   inputStringR=   t   STOPt	   namesListt   email_patternt   outputStringt   outputSentencest   ft   stopt   wt   x(    (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pyR9   D   sD    
.\:c   a         sï  yÅd } g  |  D] } t  j d d | ƒ ^ q }  t ƒ  } g  |  D]. } g  | j ƒ  D] } | j | ƒ ^ qT ^ qA } g  | D]% }	 g  |	 D] } | j ƒ  ^ q‰ ^ q| } g  | D] }	 |	 D] } | ^ q¸ q® }
 t |
 ƒ } t |
 ƒ } | j d | ƒ | d k  rd St	 d t
 d t j | j ƒ  d	 d
 ƒƒ ƒ } | j d | ƒ | j d ƒ t j d d d ƒ} t j t d d ƒ ƒ \ } } } | j d ƒ t j j j | d d d d ƒ} t | | ƒ } t t g  | D]% } | D] } d | k rÜ| ^ qÜqÒƒ ƒ } g  | D] } t  j d d | ƒ ^ q
} g  | D] }	 |	 D] } | ^ q<q2} t | ƒ } g  t t | ƒ ƒ D]$ } | | | d k rp| | ^ qp} t g  t | ƒ D] } | | rª| ^ qªƒ } g  } x¿ | D]· } g  } x› | D]“ } d | k r| j | ƒ qé| | k r'| j | ƒ qét  j d d | ƒ } | j d ƒ d } | j d ƒ d } | j | ƒ | j | ƒ qéW| j | ƒ qÖW| }  | j d ƒ g  }! xl |  D]d } g  }" t d j | ƒ ƒ }# x3 |# j D]( \ }$ }% |% dL k rÜ|" j |$ ƒ qÜqÜW|! j |" ƒ q±W| j d ƒ d }& d  }' t j j j |  d! d d" |' d# d
 d d ƒ}( | j d$ ƒ | j d% ƒ g  |! D] }) |) D] } | ^ qŠq€} t | ƒ }* i  }+ g  }, | | k  rÙ| j d& ƒ t  }- n | j d' ƒ t! }- |- rt | ƒ j" |( j# ƒ j" |* ƒ }. n! t | ƒ j" |( j# ƒ j" |* ƒ }. x} |. D]u } t | ƒ d k r>|- rt j$ t j% | t& | ƒ |( | f ƒ ƒ |+ | <n | t& | ƒ |+ | <|, j | ƒ q>q>W|  }/ g  |! D] }) |) D] } | ^ qÎqÄ} t | ƒ }* | j d( ƒ t' |+ |, d  |- d |! d) t  d* t! d d d+ d ƒ\ }0 ‰
 t( j) |0 d, d- d. d/ ƒ}1 t |0 ƒ }2 t* |1 ˆ
 |2 ƒ }3 | j d0 ƒ g  |! D] } d j | ƒ ^ q„}4 t+ d1 d2 ƒ }5 |5 j, |4 ƒ }6 i  ‰ x' |5 j- j. ƒ  D] \ }7 }8 |7 ˆ |8 <qÓWg  }9 x9 |5 j- j/ ƒ  D]( \ }7 }8 |7 |, k r|9 j |8 ƒ qqWt0 |9 ƒ ‰	 |6 d  d  … ˆ	 f }: t1 ƒ  j, |: ƒ }; |: j2 |: }< | j d3 ƒ |< j3 \ }= }> |< j4 ƒ  }? g  t t ˆ	 ƒ ƒ D] } ˆ ˆ	 | ^ q«}@ g  }A g  }B g  }C x† t5 j6 |? j7 |? j8 |? j9 ƒ D]f \ }D }E }F |F d k rö|D |E k röd }8 |8 d k r\|B j |D ƒ |C j |E ƒ |A j |8 ƒ q\qöqöWt: j; j< |A |B |C f f d4 |= |> f ƒ}G |G j= ƒ  }H ‡ ‡	 f d5 †  t t ˆ	 ƒ ƒ Dƒ }I d6 „  |I j. ƒ  Dƒ }J | j d7 ƒ |H j> d8 d ƒ }K i  ‰ x6 t t ˆ	 ƒ ƒ D]" }L t? |K |L ƒ ˆ ˆ ˆ	 |L <q		Wt
 ˆ j ƒ  ƒ ‰ t
 |3 j ƒ  ƒ ‰ ‡ ‡ f d9 †  ˆ j@ ƒ  Dƒ }M t jA d: ƒ t jA t jB |M j ƒ  ƒ ƒ ‰  ‡  f d; †  |M j/ ƒ  Dƒ ‰ ‡ f d< †  |3 j/ ƒ  Dƒ } t jA d: ƒ t jA t jB | j ƒ  ƒ ƒ ‰  ‡  f d= †  | j/ ƒ  Dƒ ‰ ‡ ‡ f d> †  ˆ j@ ƒ  Dƒ } t j
 | j ƒ  ƒ ‰ ‡ f d? †  | j/ ƒ  Dƒ }N | j d@ ƒ dA }O tC dB |O ƒ ‰ ˆ jD g  |0 D] } | ^ q›
ƒ ‡ ‡
 f dC †  t t ˆ
 ƒ ƒ Dƒ }P i  }Q g  t |O ƒ D] }D g  ^ qæ
}R g  t |O ƒ D] }D g  ^ q}S x– t |O ƒ D]ˆ }D g  t t ˆ
 ƒ ƒ D]# }L ˆ jE |L |D k r=ˆ
 |L ^ q=|R |D <g  |R |D D] } |N | ^ qu}T t jF t0 |T dD t! ƒƒ |S |D <q$Wt jG |S ƒ }U xC tH t |O ƒ ƒ D]/ \ }V }D x  |R |U |D D] } |D |Q | <qíWqÒW| j dE ƒ g  }W |O }X dF }Y x” t |X ƒ D]† }D |R |U |D }Z g  t0 g  |Z D] } | |N | g ^ qRdG tI d ƒ dD t! ƒD] } | d ^ q}[ |W j dH |D d g |[ |Y  ƒ q1Wt
 g  |W D] } t | ƒ ^ qÅƒ }\ g  }] x0 |W D]( }^ |] j |^ d g |\ t |^ ƒ ƒ qíWt t5 jJ |] Œ  ƒ } | jK t | ƒ g  t |O ƒ D]' } t? |S |U | d ƒ dI d ^ qJƒ g  t0 |N j. ƒ  dG tI d ƒ dD t! ƒD] } | d ^ q|Y  }_ | |_ | f SWn# tL k
 rê}` |` dJ dK g d f SXd  S(M   Ni'  s   [^\w]R&   s   Num of words: %did   R   i   i   t   qi2   s   Min count= %ds   Reading wiki vecss   /data/wikimodel/wiki.shelvet   flagt   rs   /data/wikimodel/wiki.pklR%   s   Running Bigramst	   min_counti   t	   thresholdt   _RG   i    s   Extracting Nounst   NNt   NNPt   NNSt   NNPSs   Word2Vec training starting...i,  i   t   workerst   sizet   iters   Word2Vec training complete...s   Creating word vecss   Using Wiki Vecs Onlys   Using Local and Wiki Vecss   Clustering for depth...t   repeatt
   normalizedt   l2_thresholdt   methodt   averaget   metrict   cosines   Computing co-occurence grapht   token_patternu   (?u)\b([^\s]+)s   Computing degreet   shapec            s!   i  |  ] } ˆ  ˆ | | “ q S(    (    (   t   .0RZ   (   t   id2wordt   sort_ids(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>&  s   	 c         S   s   i  |  ] \ } } | | “ q S(    (    (   Rr   t   keyt   value(    (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>'  s   	 s   Computing metricst   axisc            s;   i  |  ]1 } t  j d  ˆ  | ƒ t  j d  ˆ ƒ | “ q S(   i   (   t   npt   log(   Rr   RZ   (   t   degt   max_deg(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>3  s   	 g      à?c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   Rr   Ru   Rv   (   t   alpha(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>5  s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   Rr   Ru   Rv   (   t	   max_depth(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>7  s   	 c            s#   i  |  ] \ } } | ˆ  | “ q S(    (    (   Rr   Ru   Rv   (   R|   (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>9  s   	 c            s%   i  |  ] } ˆ  | ˆ | | “ q S(    (    (   Rr   Ru   (   t   deg_modt	   depth_mod(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>;  s   	 c            s'   i  |  ] \ } } | d  ˆ  | “ q S(   g      ð?(    (   Rr   Ru   Rv   (   t
   max_metric(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>=  s   	 s   Running K-meansi
   t
   n_clustersc            s$   i  |  ] } ˆ  j  | ˆ | “ q S(    (   t   labels_(   Rr   R[   (   t   kmeanst   word_d2v(    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pys
   <dictcomp>E  s   	 t   reverses   Done...Writing topicsi   Ru   s   Topic %dg      ð?t   at   b(   Rb   Rc   Rd   Re   (M   RH   RK   R   R5   t	   lemmatizet   lowerR    RQ   R:   t   mint   maxRx   t
   percentilet   valuest   shelveR6   t   cPicklet   loadt   gensimt   modelst   phrasest   Phrasest   listRP   t   xranget   appendt	   partitionR   R8   t   tagst   word2vect   Word2VecRN   R2   t   intersectiont   vocabt   arrayt   concatenatet   strt   create_word_listt   fastclustert   linkaget   calculate_depthR   t   fit_transformt   vocabulary_t   itemst	   iteritemst   sortedR   t   TRq   t   tocoot	   itertoolst   izipt   rowt   colt   datat   spt   sparset
   coo_matrixt   tocsct   sumt   intt   iterkeysRy   t   medianR   t   fitR‚   t   meant   argsortt	   enumeratet
   itemgettert   izip_longestt   insertt	   Exception(a   t
   inpContentR=   RB   t   local_vec_thresholdt   tempt   wordnet_lemmatizert   XRZ   t   sentences_lemRa   t	   flat_sentt	   word_freqt	   num_wordsR_   t   model_wiki_vect   model_wiki_vocab_lowercaset   wiki_bigram_wordt
   wiki_existt   bigramt   sentences_bigramst   bigram_listt   bigram_joinedt   wordst   bigram_freqt   to_joint   wiki_bigram_word_commont   sentences_bigrammed_tempt   sentt   new_sentt   new_wt   new_w1t   new_w2t   sentences_bigrammedt   sentences_nounst   nounst   blobt   wordt   tagt   dim_wikit   dim_datat   model_wRC   t   Vocabt
   model_combt   model_comb_vocabt	   local_vect   common_vocabt	   sentencest   data_d2vt	   spclustert
   num_pointst   depthRª   t   cvt
   bow_matrixRu   Rv   t   idst   bow_reducedRj   t   similarity_graph_reducedt   mt   nt   cxt   keyzR°   t   rot   cot   it   jt   vt   SSt   SP_fullt   id_wordt   word_idt   degsumR[   t   temp_deg_modRn   t   Kt   kmeans_labelt   kmeans_label_rankedt   topict   clust_deptht
   temp_scoret   indext   numt   listert   to_showt   to_show_wordst   topt   sort_topt   max_lent   new_listt   list_elt   sorted_wordst   e(    (   R|   Rz   R~   R   Rs   Rƒ   R{   R}   R€   Rt   R„   s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pyR;   o   sJ   (	;2&-!!>(&=/-&	$!3&;"-.*% ++ %@!%R$%&J?t   __main__()   t   collectionsR    t   sklearn.clusterR   t   numpyRx   t   scipyR±   R¬   t   jsonR/   R‘   RH   RL   t	   nltk.stemR   t   textblobR   R   R   t   sklearn.feature_extraction.textR   R   R'   R3   t   sklearnR¢   t   networkxt   utilsRŽ   t   csvR   t   bs4R   R   R   RD   RN   R9   R;   t   __name__(    (    (    s2   /var/www/html/vec2topic/vec2topic/vec2topic_app.pyt   <module>   s6   
		+þ