BERT Functions

BERT Functions

I define two functions for running BERT models. The first, `bert_preprocess`, converts a single text document into the format that BERT expects. The second, `get_bert_embeddings`, uses the tokenized text to generate embedding values with a pre-trained BERT model.

from transformers import BertModel, BertTokenizer, AutoTokenizer
import numpy as np
import streamlit as st
import re
import pandas as pd
from datetime import datetime
import nltk
import torch
# Load the model and the tokenizer from the SAME checkpoint.
# Previously the model came from 'bert-base-uncased' and the tokenizer from
# 'bert-base-cased'. Those checkpoints ship different vocabularies, so token
# ids produced by one tokenizer select unrelated rows of the other model's
# embedding matrix. Change MODEL_NAME (e.g. to 'bert-base-cased') to switch
# both together.
MODEL_NAME = 'bert-base-uncased'
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Input is the paragraph-level corpus file read below.
# NOTE(review): an earlier comment described the input as "light.csv" with
# stop words removed, but the code reads paragraph.csv -- confirm which file
# is intended and whether stop words are present.
df = pd.read_csv('../../../data/processed/paragraph.csv')
# Pull the year and text columns out as plain Python lists
timestamps = df.year.to_list()
texts = df.text.to_list()
# Use the second document (index 1) as the worked example for this notebook
text = texts[1]
df.head(1)
Unnamed: 0 ccode_iso session year paragraph_index text
0 1 AFG 7 1952 1 I consider it a great honour and privilege to ...
# Sanity check: the example document is expected to be a plain Python string
print(type(text))
<class 'str'>

Define functions

def bert_preprocess(text, max_length=512, bert_tokenizer=None):
    """
    Preprocesses a document into a BERT-recognizable format.

    Input:
        text (str): the document to encode.
        max_length (int): total sequence length, including the [CLS] and
            [SEP] markers. Defaults to 512.
        bert_tokenizer: optional object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``. When omitted, the module-level
            ``tokenizer`` is used.
    Output: three objects ready to be used for Bert modeling
        marked_text (list): tokens wrapped in "[CLS]" ... "[SEP]". This list
            is NOT padded, so it can be shorter than the two lists below.
        indexed_tokens (list): token ids, padded with 0 up to max_length.
        attention_mask (list): 1 for every real token, 0 for every padding
            position; same length as indexed_tokens.
    Raises:
        ValueError: if max_length is too small to hold [CLS] and [SEP].
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit [CLS] and [SEP]")

    # Only touch the global tokenizer when the caller did not supply one
    active_tokenizer = tokenizer if bert_tokenizer is None else bert_tokenizer

    # Tokenize the text
    tokenized_text = list(active_tokenizer.tokenize(text))

    # Room left for real tokens once [CLS] and [SEP] are accounted for
    max_tokens = max_length - 2
    excess = len(tokenized_text) - max_tokens

    # Truncate the beginning and end of the text, keeping the middle window.
    # This must only happen when the text is actually too long: slicing with
    # a zero or negative excess silently drops tokens (or everything).
    if excess > 0:
        start = excess // 2
        truncated_text = tokenized_text[start:start + max_tokens]
    else:
        truncated_text = tokenized_text

    # Add special tokens [CLS] and [SEP]. They must be spelled exactly as in
    # the vocabulary (no surrounding spaces) to resolve to the right ids.
    marked_text = ["[CLS]"] + truncated_text + ["[SEP]"]
    indexed_tokens = list(active_tokenizer.convert_tokens_to_ids(marked_text))
    attention_mask = [1] * len(indexed_tokens)

    # Pad ids and mask all the way to max_length (id 0, as in the original
    # code; presumably the [PAD] id of the BERT vocabulary -- verify if a
    # non-BERT tokenizer is ever passed in).
    pad_count = max_length - len(indexed_tokens)
    indexed_tokens.extend([0] * pad_count)
    attention_mask.extend([0] * pad_count)

    return marked_text, indexed_tokens, attention_mask
# Preprocess the example document into tokens, token ids and attention mask
marked_text, indexed_tokens, attention_mask = bert_preprocess(text)
# Print the docstring of get_bert_embeddings.
# NOTE(review): read top to bottom, this call comes before the function is
# defined further down; it only works if the defining cell has already been run.
help(get_bert_embeddings)
Help on function get_bert_embeddings in module __main__:

get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
    input: processed text
    output: dataframe of embedding weights for each token 
        ex) dimension of 512*768 where row represents token, column represents bert features
def get_bert_embeddings(marked_text, indexed_tokens, attention_mask):
    """
    Produces one embedding vector per distinct token of a preprocessed text.

    input:
        marked_text (list): token strings, used as the row labels
        indexed_tokens (list): token ids fed to the model
        attention_mask (list): mask fed to the model alongside the ids
    output: dataframe indexed by 'term' with one column per model feature
        (768 columns in the example run); rows that share the same term are
        averaged into a single row
    """
    # Wrapping each list in an outer list gives the tensors a leading batch axis
    id_batch = torch.tensor([indexed_tokens])
    mask_batch = torch.tensor([attention_mask])

    # Flatten any leading axes so both inputs are (rows, sequence_length)
    id_batch = id_batch.view(-1, id_batch.size(-1))
    mask_batch = mask_batch.view(-1, mask_batch.size(-1))

    # Inference only: no gradients are needed
    with torch.no_grad():
        model_out = model(input_ids=id_batch, attention_mask=mask_batch)

        # Element 2 of the output holds the hidden states; take its first entry.
        # NOTE(review): per the transformers docs, entry 0 of the hidden-states
        # tuple is the embedding-layer output rather than the final encoder
        # layer -- confirm this is the layer that is wanted.
        layer_values = model_out[2][0].squeeze().numpy()

    # One row per sequence position, one column per feature
    table = pd.DataFrame(layer_values)
    table['term'] = pd.Series(marked_text, name='term')

    # Put the 'term' column first, followed by the feature columns
    feature_columns = [name for name in table.columns if name != 'term']
    table = table[['term'] + feature_columns]

    # Collapse repeated tokens into a single row by averaging their vectors
    return table.groupby(['term']).mean()
# Run the example document through the model and display the per-term embedding table
get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
0 1 2 3 4 5 6 7 8 9 ... 758 759 760 761 762 763 764 765 766 767
term
[SEP] -0.599731 -0.287527 0.995737 -0.067600 -0.116662 -0.319243 -0.035646 -0.550722 -0.154269 0.226906 ... 0.433558 0.360281 0.753348 -1.155800 0.198939 0.126193 0.058285 0.035218 -0.225301 -0.376395
##s -0.082928 0.064610 0.062934 1.201868 0.416490 -0.351008 -0.419693 0.793464 -0.682201 -0.435875 ... -1.640653 -0.082774 1.440754 0.477181 0.555801 0.517778 0.029644 0.167330 -0.804072 1.100950
, 0.238827 -0.499530 -0.229385 -0.420359 0.382101 -0.133325 -0.423249 -0.133761 0.079275 -0.810453 ... -0.476354 0.310680 -0.071447 -0.350534 -0.166876 -0.152760 0.157087 0.182910 -0.305537 0.119838
- 0.211686 -0.337158 -0.282966 -0.379349 0.213550 -0.254544 -0.361127 -0.094978 0.072562 -0.825429 ... -0.457777 0.271165 0.238502 -0.122299 -0.090638 -0.070707 0.017792 -0.049120 -0.269647 0.226714
. 0.117108 -0.388444 -0.088623 0.064858 0.523230 -0.428734 -0.267266 -0.420127 0.190312 -0.766927 ... -0.490892 0.354983 -0.355035 -0.418002 0.253672 0.086620 0.108094 -0.150912 -0.198612 0.161442
In 1.179606 0.055646 0.182922 1.080442 0.191964 -0.443555 -0.347383 0.757875 -0.356111 -1.096234 ... 0.154227 0.468894 0.522242 -0.500889 0.947098 1.150616 -0.767531 -0.148597 -0.223548 0.134304
Nations 0.006905 -0.580956 0.575177 -1.220215 0.371888 -0.279317 0.700901 -0.903072 0.631587 -0.739470 ... 0.495864 0.132003 0.067810 0.293461 0.424441 0.552663 0.318386 -0.620372 -0.583365 0.048380
United -0.077556 -0.105724 1.057649 -0.423865 0.314821 -0.185792 -0.714371 -0.607652 -0.070977 0.103325 ... -0.704180 -1.008595 -0.426895 -0.018947 0.567454 0.483050 -0.180623 -0.287286 -0.556010 0.511452
[CLS] -0.118897 -0.518255 0.159338 -0.461482 -0.003488 -0.453042 -0.212884 -0.229699 -0.063944 -0.421272 ... 0.242429 0.009379 0.467546 -0.957577 0.114305 -0.369990 0.035248 0.089144 -0.146707 -0.127492
accomplish -0.085958 0.289747 0.502433 -0.699373 -0.112547 0.115439 1.011047 -0.570650 -0.124616 -0.372738 ... -1.153663 -0.272904 0.580479 0.201596 -0.194586 -0.872371 -0.358961 0.039148 0.101910 -0.169740
acts -0.178121 0.666130 0.413916 0.099232 -1.034455 -0.040334 -0.048890 -0.098929 -0.145200 0.480938 ... -1.342113 0.323047 0.763901 -1.770176 0.986050 -0.538208 -0.744755 0.057135 -0.445161 -0.774442
aid -0.427848 0.147286 0.013369 0.471762 0.168422 0.895970 0.872069 1.716045 0.837368 0.494484 ... -0.787848 -0.445969 0.358512 0.965379 0.365942 0.054571 -0.555102 -0.860554 -0.171662 -1.067538
aims 1.024410 -0.256186 -0.053223 0.638275 -0.077131 -0.311400 -0.234701 -0.026557 0.915609 -0.596711 ... -0.522890 1.130598 -0.809434 -0.718155 -0.150897 -0.814818 -0.124106 -1.064612 -0.574141 -0.703143
all -0.002165 0.033546 0.609011 -0.239792 -0.932221 -1.037787 0.136487 -0.882456 0.342895 -0.798731 ... 0.239174 -0.662943 0.205603 0.161089 -0.176249 -0.105027 0.036648 0.276853 0.527183 -0.216994
and 0.685023 0.240559 -0.555069 -0.011118 0.690408 0.024725 -0.583992 -0.181338 -0.816213 0.622959 ... -0.739703 0.255513 0.252658 -0.401268 0.020371 0.737326 0.633780 -0.291272 -0.483794 -0.894787
areas 0.720950 -0.746471 0.160797 -0.725814 1.154829 -0.101660 -0.124824 -0.682128 -0.094830 0.434859 ... -0.126514 0.163580 0.774643 0.072612 0.070189 0.888373 0.193730 0.033260 0.078655 0.414108
as -0.917634 0.064711 -0.368008 0.218496 1.246304 -0.059035 -1.098246 0.450588 -1.106012 -1.061205 ... -0.457125 -0.102798 0.220228 -0.558394 -0.207785 0.430128 -1.262651 0.324153 -0.113458 0.350910
assistance -0.678422 0.172338 0.265142 0.258749 -0.105260 -0.331368 -0.350325 0.263004 -0.668104 -0.555148 ... -0.475693 0.409995 0.346237 0.555776 0.440133 -0.523475 0.134694 0.275381 -0.253818 0.210988
coincide -0.170578 -0.268886 -0.115447 -0.694355 0.483255 -0.280104 -0.640158 0.034518 -0.001312 0.433044 ... -0.380226 -0.208592 -0.748574 -0.197001 0.404674 -0.318124 -0.521167 0.667394 -0.002012 1.150480
collective 0.151802 0.762978 -0.005754 0.526812 0.764982 0.812513 -0.570761 -0.021987 -1.306559 -0.586478 ... -0.281096 -1.654515 -0.409773 -0.740668 0.533563 -0.904656 0.133838 -0.766242 0.390741 -0.584280
conscience 0.158133 -0.274482 -0.492211 -0.170320 0.652437 -0.445447 0.168138 -0.694122 -0.364806 -1.202534 ... -0.850401 -0.854405 0.362089 0.396198 0.172970 -0.543768 -1.138113 0.421941 1.142612 1.083518
cultural -1.082594 -0.155604 0.376953 -0.332056 -0.495655 -1.082139 0.782310 -0.506714 0.033708 0.338000 ... 0.613047 0.565934 -1.216563 0.057687 -0.932211 -0.200133 -0.734939 0.179817 -0.215364 -0.216755
developed -0.195352 0.150783 0.080199 -0.162700 0.360601 0.163544 -0.581970 0.052141 0.185778 -0.233883 ... -1.082397 0.500910 0.256611 -0.607807 -0.145842 0.529889 0.446537 0.195365 -0.643610 -0.130024
development 0.298387 0.496667 -0.434238 0.093009 -1.293615 0.209092 -0.368899 0.919280 0.138753 -0.650476 ... -0.645376 1.387810 0.642955 0.327757 -0.424770 -0.255853 0.202628 0.501966 -0.376638 -0.267928
duties -0.651811 -0.242606 0.490167 -0.137662 0.505857 -0.002086 1.298450 0.396070 1.254041 -0.024695 ... 0.302751 -0.295479 -0.401752 -0.347428 -0.645984 -0.070883 -1.304861 -0.363276 -0.646936 -1.105535
economic 0.327582 0.387518 -0.703620 0.579996 0.298967 0.924665 0.373069 -0.637522 0.973225 0.572886 ... 0.214301 0.598506 0.876789 0.094422 0.311424 -0.415416 -0.762125 0.453513 0.134451 -0.050874
ends -0.546396 -0.359361 0.212410 0.277001 0.106774 0.592411 -0.247585 -0.792451 -0.884770 0.545342 ... -0.271307 -0.330248 0.591369 0.783333 1.241485 -0.296724 0.078918 0.412234 0.387542 0.192228
for -0.253029 -0.404490 -0.261728 0.038780 0.154741 0.588547 -1.170278 -0.019293 0.076067 -1.264595 ... -0.787064 -0.461386 -0.638114 -0.482141 -0.344258 -0.054569 0.378593 -0.465482 -0.611659 0.433159
forward -0.207178 0.236094 -0.957996 -0.357690 0.334218 -0.413327 -0.956618 -0.011278 -0.408511 -0.037497 ... -1.214147 -0.303270 0.135196 0.555519 0.122476 0.240908 -0.056521 0.050130 -0.211742 0.378766
guide -1.178150 0.731618 -0.593804 0.415418 1.651968 -0.073628 0.057820 -0.396742 -0.996873 0.429075 ... 0.614620 -0.451440 -1.288784 -0.389848 0.017629 -0.483427 -0.484336 0.149367 0.323914 -1.239273
happily -0.552708 0.431273 -0.276702 -0.626600 1.248792 -0.940124 -0.367457 -0.736995 0.878078 0.467198 ... 0.395383 -0.414090 -0.134464 -0.228733 -0.090979 0.121471 0.091699 -0.064050 -0.222706 0.177931
in -0.825479 -1.051559 0.758063 0.673075 0.240419 0.115155 -0.920780 0.137270 1.564308 -0.329897 ... -0.943635 0.426585 -0.385542 -0.110009 -1.135049 -0.032381 -0.399695 0.427312 0.118642 0.217453
interest 0.087404 -0.640190 -0.453728 -0.850574 -0.157392 -0.533750 0.637173 -0.608730 -0.633894 0.933405 ... 0.899409 1.313719 0.314493 0.620090 -0.622023 0.070112 -1.821788 -0.131984 -0.347964 -0.195112
its -0.217026 0.894963 -0.105397 -0.401083 -0.738265 -0.480244 -0.346089 -0.352199 1.088866 -0.914263 ... -0.941486 0.522719 0.290851 0.399336 -0.462239 -0.174071 -0.092375 0.078811 0.854677 -0.169473
moral 0.619293 0.499217 0.249499 -0.751159 0.466876 -0.062412 -0.184458 -0.660417 0.652479 -1.639679 ... -0.382880 0.017941 0.048614 -0.491065 0.557178 1.361697 0.181574 -0.451610 0.301997 -0.267717
must -0.157701 0.589252 0.273769 0.857760 0.567033 0.108004 -1.062661 -1.089537 -0.240025 -0.304565 ... 0.769697 -0.041358 -0.410516 0.544890 -0.466517 -1.122368 0.304115 0.411068 0.301097 0.433811
objectives -0.875071 0.340940 -0.162661 0.404240 0.205024 -0.591867 0.305278 0.519380 0.144190 -0.521769 ... -0.237458 -0.047777 0.328942 -1.136463 0.665098 0.725972 0.299084 -0.483213 -0.158996 -0.867711
of -0.327949 -0.544503 -0.092067 -1.195621 0.039030 -1.281222 -0.414859 0.070955 -0.199553 -1.428178 ... -0.035053 0.183135 0.080120 -0.322540 -0.468436 0.623744 0.315495 0.084182 0.077692 0.203555
order -0.067636 -1.451491 0.146754 -0.419711 0.023377 -0.810850 -0.734983 -0.020393 -0.560763 0.204884 ... -0.109579 0.385012 -0.002484 -0.126181 -0.709134 0.109215 1.085052 0.454821 0.048261 0.162490
peoples -0.221793 -0.247048 -0.341812 0.627216 1.513762 -0.743124 -0.456467 -1.155934 0.653813 -0.737646 ... 0.766392 -0.728573 -0.624886 0.481552 0.536343 -0.368652 0.979576 0.851340 -0.597788 0.720032
phases -0.914526 0.031760 -0.057205 -0.529099 0.335894 -0.689959 -0.019860 0.153372 0.436006 -0.038952 ... -0.668004 0.173632 -0.416013 0.557431 -0.360914 -0.490024 -0.671597 0.189420 0.384801 -0.988437
practical 0.497227 0.944226 -0.767903 -0.117738 -0.066879 0.199990 0.709391 0.005245 0.146205 -0.415131 ... 0.073728 -0.115236 -0.118528 0.329659 -0.090454 0.459422 -1.281357 -0.437196 -0.268200 0.232447
push -0.184951 0.711796 0.721361 -0.777537 0.321118 0.102113 -0.503164 0.589479 1.586533 0.306067 ... -0.007689 -0.354350 -0.783757 0.479150 0.510954 0.808599 -1.763679 1.179172 -0.048447 -0.237922
realization 0.680622 0.343941 0.359005 -0.110402 0.846315 -1.126212 0.410226 -0.506499 -0.529498 -1.226504 ... 0.534820 0.661075 0.972976 0.132633 0.495696 -0.392315 0.310346 0.728478 -0.813280 0.737264
self 0.257648 0.058829 -0.417947 0.250967 0.102127 -0.601216 0.257821 -0.101740 -0.363018 -0.105056 ... 1.385324 -0.540304 -0.544518 0.790096 0.714771 0.062684 -0.041591 0.477603 0.045479 0.348599
social 0.001411 0.083508 -0.384193 0.038151 0.828193 -0.295133 -0.131154 -0.310905 -0.194347 -0.355901 ... -0.688219 -0.004625 -0.023948 0.265741 -0.029004 -0.086471 0.103009 1.135597 -0.636987 -0.338693
the 1.259149 -0.390939 -0.574276 -0.651518 0.975174 -0.346215 -0.203062 0.069047 -0.972633 -0.484539 ... -0.177573 0.605176 -0.330061 -0.963901 0.492528 0.279568 -0.408966 0.528585 -1.137273 0.370578
their -0.141068 -0.385690 -0.610704 -0.631536 0.036775 -0.008408 0.058548 0.447865 -0.403555 -0.706317 ... -0.900095 0.260387 -0.259527 -0.011186 0.366235 -0.750171 0.071075 0.316691 0.273126 -1.098873
these -0.867060 -0.423800 -0.462822 0.031337 -0.921427 0.606853 -0.407772 0.609299 -0.070913 0.411936 ... -0.375234 -0.172975 0.506652 -0.142546 0.120540 -1.079829 -0.097479 -0.512669 -0.959007 0.081844
through -0.806322 0.435404 -0.860480 -0.352367 -0.911619 0.371939 -0.302082 -0.138660 -0.286805 0.741813 ... 0.205986 0.712598 -0.843359 0.651225 0.107572 0.096148 -0.935132 0.180383 0.055736 -0.279496
to -0.467868 -0.124578 -0.321987 0.056667 -0.164021 1.174730 -0.633803 0.867373 -1.580327 -0.784350 ... -0.367765 -0.081297 0.588999 0.236422 0.058517 -0.139931 0.747637 -0.138854 -0.143276 -0.009610
under 0.687285 -0.849716 0.345514 -0.015210 -0.001491 0.095282 -0.069619 -0.023330 -0.572866 -1.167167 ... -0.920049 0.214902 0.161194 -0.373119 -0.300902 -0.546927 0.374067 0.461742 0.977844 -0.106500
we 0.493856 -0.277951 1.282449 -0.818842 -0.315630 1.576497 -0.768214 -1.161283 -1.137017 0.787399 ... 0.117417 -0.324164 -0.510348 0.801145 -0.253922 0.568123 0.851370 -0.558899 -0.064633 -0.407102
which -0.368462 -0.361856 0.409696 0.587431 0.708897 0.291840 -0.362036 -0.530712 -0.053614 -0.711788 ... -1.084328 -0.832080 0.269114 0.851772 -0.942639 -0.081171 -1.273354 -0.796438 -0.260538 0.355565
with -0.388580 -0.219293 -0.652044 -1.386387 -0.089682 -0.403928 -1.394354 0.763693 -0.379815 -0.662670 ... -0.599527 -0.076366 0.786938 -1.619844 -0.512546 -0.580194 -0.000503 -0.181073 -1.548287 0.417536
world 0.737164 -0.141081 0.328124 0.615530 0.357371 0.274039 -0.563317 0.040565 -0.111780 0.075683 ... -0.404257 0.543159 0.008046 0.476506 0.046138 -0.633671 -0.073570 -0.446745 -0.119588 -0.238425

56 rows × 768 columns