from transformers import BertModel, BertTokenizer, AutoTokenizer
import numpy as np
import streamlit as st
import re
import pandas as pd
from datetime import datetime
import nltk
import torch
BERT Functions
BERT Functions
I define two functions to run BERT models. The first processes a single text document into a format that BERT recognizes. The second uses the tokenized text to generate embedding values with a pre-trained BERT model.
# The model and the tokenizer must come from the same checkpoint: the cased and
# uncased BERT checkpoints ship different vocabularies, so ids produced by one
# tokenizer do not refer to the same words in the other model's embeddings.
# NOTE(review): the cased checkpoint is used for both because the tokenizer was
# already cased; switch both to 'bert-base-uncased' if lower-casing is intended.
BERT_CHECKPOINT = 'bert-base-cased'
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

# input is "light.csv" which does not include stop words.
# NOTE(review): the path below reads paragraph.csv, not light.csv - confirm
# which file is intended.
df = pd.read_csv('../../../data/processed/paragraph.csv')
# Filter
timestamps = df.year.to_list()
texts = df.text.to_list()
text = texts[1]
df.head(1)
| | Unnamed: 0 | ccode_iso | session | year | paragraph_index | text |
|---|---|---|---|---|---|---|
| 0 | 1 | AFG | 7 | 1952 | 1 | I consider it a great honour and privilege to ... |
print(type(text))
<class 'str'>
Define functions
def bert_preprocess(text, max_seq_length=512):
    """
    Preprocess a document into a BERT-recognizable format.

    The text is tokenized with the module-level ``tokenizer``. When it has
    more tokens than fit in ``max_seq_length`` (two positions are reserved for
    [CLS] and [SEP]), the excess is cut evenly from the beginning and the end
    so the middle of the text is kept. Shorter inputs are left untouched and
    the id/mask lists are padded with 0 up to ``max_seq_length``.

    Input:
        text: text in a string format
        max_seq_length: total sequence length including [CLS] and [SEP]
            (default 512)
    Output: three objects ready to be used for BERT modeling
        marked_text (list): tokens wrapped in [CLS] ... [SEP], not padded
        indexed_tokens (list): token ids, padded to max_seq_length
        attention_mask (list): 1 for real tokens, 0 for padding positions
    Raises:
        ValueError: if max_seq_length leaves no room for [CLS] and [SEP]
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")

    # Tokenize the text
    tokenized_text = tokenizer.tokenize(text)

    # Only truncate when the text is actually too long. Slicing with a
    # non-positive excess would drop or mangle tokens of short documents.
    max_content_tokens = max_seq_length - 2  # -2 to account for [CLS] and [SEP]
    excess = len(tokenized_text) - max_content_tokens
    if excess > 0:
        # Drop floor(excess / 2) tokens from the front and the rest from the end
        drop_front = excess // 2
        truncated_text = tokenized_text[drop_front:drop_front + max_content_tokens]
    else:
        truncated_text = tokenized_text

    # Add special tokens [CLS] and [SEP]. They must match the vocabulary
    # entries exactly (no surrounding spaces) to be mapped to their own ids.
    marked_text = ["[CLS]"] + truncated_text + ["[SEP]"]
    indexed_tokens = tokenizer.convert_tokens_to_ids(marked_text)
    attention_mask = [1] * len(indexed_tokens)

    # Pad sequences to max_seq_length; padded positions are masked out.
    # Assumes id 0 is the tokenizer's padding id - TODO confirm.
    pad_count = max_seq_length - len(indexed_tokens)
    indexed_tokens.extend([0] * pad_count)
    attention_mask.extend([0] * pad_count)

    return marked_text, indexed_tokens, attention_mask


marked_text, indexed_tokens, attention_mask = bert_preprocess(text)
help(get_bert_embeddings)
# Output:
# Help on function get_bert_embeddings in module __main__:
get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
input: processed text
output: dataframe of embedding weights for each token
ex) dimension of 512*768 where row represents token, column represents bert features
def get_bert_embeddings(marked_text, indexed_tokens, attention_mask):
    """
    Generate embedding values for tokenized text.

    input: processed text, indexed_tokens and attention mask (all in list format)
    output: dataframe indexed by 'term', with one row per distinct token and
        one column per BERT feature; tokens that occur more than once are
        averaged into a single row
        ex) 56 rows x 768 columns for the sample paragraph
    """
    # Convert lists to PyTorch tensors; wrapping in a list adds the batch
    # dimension, giving shape (1, sequence_length)
    tokens_tensors = torch.tensor([indexed_tokens])
    attention_masks = torch.tensor([attention_mask])

    with torch.no_grad():
        # Run the embedding
        outputs = model(input_ids=tokens_tensors, attention_mask=attention_masks)

    # Extract the hidden states. outputs[2] holds the per-layer hidden states
    # (the model is loaded with output_hidden_states=True) and [0] selects the
    # first of them.
    # NOTE(review): presumably this is the embedding-layer output rather than
    # the final contextual layer - confirm that is intended.
    # squeeze(0) removes only the batch axis, so a length-1 sequence keeps its
    # (sequence_length, features) shape.
    hidden_states = outputs[2][0].squeeze(0).numpy()

    # Convert to data frame. Terms are aligned by position; rows beyond
    # len(marked_text) (e.g. padding) get a missing term and are left out of
    # the grouped result.
    df_outputs = pd.DataFrame(hidden_states)
    df_outputs['term'] = pd.Series(marked_text, name='term')

    # Remove duplicate tokens by averaging them out
    df_outputs_embedding = df_outputs.groupby(['term']).mean()
    return df_outputs_embedding


get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
# Output (header of the table whose rows follow):
# | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term | |||||||||||||||||||||
| [SEP] | -0.599731 | -0.287527 | 0.995737 | -0.067600 | -0.116662 | -0.319243 | -0.035646 | -0.550722 | -0.154269 | 0.226906 | ... | 0.433558 | 0.360281 | 0.753348 | -1.155800 | 0.198939 | 0.126193 | 0.058285 | 0.035218 | -0.225301 | -0.376395 |
| ##s | -0.082928 | 0.064610 | 0.062934 | 1.201868 | 0.416490 | -0.351008 | -0.419693 | 0.793464 | -0.682201 | -0.435875 | ... | -1.640653 | -0.082774 | 1.440754 | 0.477181 | 0.555801 | 0.517778 | 0.029644 | 0.167330 | -0.804072 | 1.100950 |
| , | 0.238827 | -0.499530 | -0.229385 | -0.420359 | 0.382101 | -0.133325 | -0.423249 | -0.133761 | 0.079275 | -0.810453 | ... | -0.476354 | 0.310680 | -0.071447 | -0.350534 | -0.166876 | -0.152760 | 0.157087 | 0.182910 | -0.305537 | 0.119838 |
| - | 0.211686 | -0.337158 | -0.282966 | -0.379349 | 0.213550 | -0.254544 | -0.361127 | -0.094978 | 0.072562 | -0.825429 | ... | -0.457777 | 0.271165 | 0.238502 | -0.122299 | -0.090638 | -0.070707 | 0.017792 | -0.049120 | -0.269647 | 0.226714 |
| . | 0.117108 | -0.388444 | -0.088623 | 0.064858 | 0.523230 | -0.428734 | -0.267266 | -0.420127 | 0.190312 | -0.766927 | ... | -0.490892 | 0.354983 | -0.355035 | -0.418002 | 0.253672 | 0.086620 | 0.108094 | -0.150912 | -0.198612 | 0.161442 |
| In | 1.179606 | 0.055646 | 0.182922 | 1.080442 | 0.191964 | -0.443555 | -0.347383 | 0.757875 | -0.356111 | -1.096234 | ... | 0.154227 | 0.468894 | 0.522242 | -0.500889 | 0.947098 | 1.150616 | -0.767531 | -0.148597 | -0.223548 | 0.134304 |
| Nations | 0.006905 | -0.580956 | 0.575177 | -1.220215 | 0.371888 | -0.279317 | 0.700901 | -0.903072 | 0.631587 | -0.739470 | ... | 0.495864 | 0.132003 | 0.067810 | 0.293461 | 0.424441 | 0.552663 | 0.318386 | -0.620372 | -0.583365 | 0.048380 |
| United | -0.077556 | -0.105724 | 1.057649 | -0.423865 | 0.314821 | -0.185792 | -0.714371 | -0.607652 | -0.070977 | 0.103325 | ... | -0.704180 | -1.008595 | -0.426895 | -0.018947 | 0.567454 | 0.483050 | -0.180623 | -0.287286 | -0.556010 | 0.511452 |
| [CLS] | -0.118897 | -0.518255 | 0.159338 | -0.461482 | -0.003488 | -0.453042 | -0.212884 | -0.229699 | -0.063944 | -0.421272 | ... | 0.242429 | 0.009379 | 0.467546 | -0.957577 | 0.114305 | -0.369990 | 0.035248 | 0.089144 | -0.146707 | -0.127492 |
| accomplish | -0.085958 | 0.289747 | 0.502433 | -0.699373 | -0.112547 | 0.115439 | 1.011047 | -0.570650 | -0.124616 | -0.372738 | ... | -1.153663 | -0.272904 | 0.580479 | 0.201596 | -0.194586 | -0.872371 | -0.358961 | 0.039148 | 0.101910 | -0.169740 |
| acts | -0.178121 | 0.666130 | 0.413916 | 0.099232 | -1.034455 | -0.040334 | -0.048890 | -0.098929 | -0.145200 | 0.480938 | ... | -1.342113 | 0.323047 | 0.763901 | -1.770176 | 0.986050 | -0.538208 | -0.744755 | 0.057135 | -0.445161 | -0.774442 |
| aid | -0.427848 | 0.147286 | 0.013369 | 0.471762 | 0.168422 | 0.895970 | 0.872069 | 1.716045 | 0.837368 | 0.494484 | ... | -0.787848 | -0.445969 | 0.358512 | 0.965379 | 0.365942 | 0.054571 | -0.555102 | -0.860554 | -0.171662 | -1.067538 |
| aims | 1.024410 | -0.256186 | -0.053223 | 0.638275 | -0.077131 | -0.311400 | -0.234701 | -0.026557 | 0.915609 | -0.596711 | ... | -0.522890 | 1.130598 | -0.809434 | -0.718155 | -0.150897 | -0.814818 | -0.124106 | -1.064612 | -0.574141 | -0.703143 |
| all | -0.002165 | 0.033546 | 0.609011 | -0.239792 | -0.932221 | -1.037787 | 0.136487 | -0.882456 | 0.342895 | -0.798731 | ... | 0.239174 | -0.662943 | 0.205603 | 0.161089 | -0.176249 | -0.105027 | 0.036648 | 0.276853 | 0.527183 | -0.216994 |
| and | 0.685023 | 0.240559 | -0.555069 | -0.011118 | 0.690408 | 0.024725 | -0.583992 | -0.181338 | -0.816213 | 0.622959 | ... | -0.739703 | 0.255513 | 0.252658 | -0.401268 | 0.020371 | 0.737326 | 0.633780 | -0.291272 | -0.483794 | -0.894787 |
| areas | 0.720950 | -0.746471 | 0.160797 | -0.725814 | 1.154829 | -0.101660 | -0.124824 | -0.682128 | -0.094830 | 0.434859 | ... | -0.126514 | 0.163580 | 0.774643 | 0.072612 | 0.070189 | 0.888373 | 0.193730 | 0.033260 | 0.078655 | 0.414108 |
| as | -0.917634 | 0.064711 | -0.368008 | 0.218496 | 1.246304 | -0.059035 | -1.098246 | 0.450588 | -1.106012 | -1.061205 | ... | -0.457125 | -0.102798 | 0.220228 | -0.558394 | -0.207785 | 0.430128 | -1.262651 | 0.324153 | -0.113458 | 0.350910 |
| assistance | -0.678422 | 0.172338 | 0.265142 | 0.258749 | -0.105260 | -0.331368 | -0.350325 | 0.263004 | -0.668104 | -0.555148 | ... | -0.475693 | 0.409995 | 0.346237 | 0.555776 | 0.440133 | -0.523475 | 0.134694 | 0.275381 | -0.253818 | 0.210988 |
| coincide | -0.170578 | -0.268886 | -0.115447 | -0.694355 | 0.483255 | -0.280104 | -0.640158 | 0.034518 | -0.001312 | 0.433044 | ... | -0.380226 | -0.208592 | -0.748574 | -0.197001 | 0.404674 | -0.318124 | -0.521167 | 0.667394 | -0.002012 | 1.150480 |
| collective | 0.151802 | 0.762978 | -0.005754 | 0.526812 | 0.764982 | 0.812513 | -0.570761 | -0.021987 | -1.306559 | -0.586478 | ... | -0.281096 | -1.654515 | -0.409773 | -0.740668 | 0.533563 | -0.904656 | 0.133838 | -0.766242 | 0.390741 | -0.584280 |
| conscience | 0.158133 | -0.274482 | -0.492211 | -0.170320 | 0.652437 | -0.445447 | 0.168138 | -0.694122 | -0.364806 | -1.202534 | ... | -0.850401 | -0.854405 | 0.362089 | 0.396198 | 0.172970 | -0.543768 | -1.138113 | 0.421941 | 1.142612 | 1.083518 |
| cultural | -1.082594 | -0.155604 | 0.376953 | -0.332056 | -0.495655 | -1.082139 | 0.782310 | -0.506714 | 0.033708 | 0.338000 | ... | 0.613047 | 0.565934 | -1.216563 | 0.057687 | -0.932211 | -0.200133 | -0.734939 | 0.179817 | -0.215364 | -0.216755 |
| developed | -0.195352 | 0.150783 | 0.080199 | -0.162700 | 0.360601 | 0.163544 | -0.581970 | 0.052141 | 0.185778 | -0.233883 | ... | -1.082397 | 0.500910 | 0.256611 | -0.607807 | -0.145842 | 0.529889 | 0.446537 | 0.195365 | -0.643610 | -0.130024 |
| development | 0.298387 | 0.496667 | -0.434238 | 0.093009 | -1.293615 | 0.209092 | -0.368899 | 0.919280 | 0.138753 | -0.650476 | ... | -0.645376 | 1.387810 | 0.642955 | 0.327757 | -0.424770 | -0.255853 | 0.202628 | 0.501966 | -0.376638 | -0.267928 |
| duties | -0.651811 | -0.242606 | 0.490167 | -0.137662 | 0.505857 | -0.002086 | 1.298450 | 0.396070 | 1.254041 | -0.024695 | ... | 0.302751 | -0.295479 | -0.401752 | -0.347428 | -0.645984 | -0.070883 | -1.304861 | -0.363276 | -0.646936 | -1.105535 |
| economic | 0.327582 | 0.387518 | -0.703620 | 0.579996 | 0.298967 | 0.924665 | 0.373069 | -0.637522 | 0.973225 | 0.572886 | ... | 0.214301 | 0.598506 | 0.876789 | 0.094422 | 0.311424 | -0.415416 | -0.762125 | 0.453513 | 0.134451 | -0.050874 |
| ends | -0.546396 | -0.359361 | 0.212410 | 0.277001 | 0.106774 | 0.592411 | -0.247585 | -0.792451 | -0.884770 | 0.545342 | ... | -0.271307 | -0.330248 | 0.591369 | 0.783333 | 1.241485 | -0.296724 | 0.078918 | 0.412234 | 0.387542 | 0.192228 |
| for | -0.253029 | -0.404490 | -0.261728 | 0.038780 | 0.154741 | 0.588547 | -1.170278 | -0.019293 | 0.076067 | -1.264595 | ... | -0.787064 | -0.461386 | -0.638114 | -0.482141 | -0.344258 | -0.054569 | 0.378593 | -0.465482 | -0.611659 | 0.433159 |
| forward | -0.207178 | 0.236094 | -0.957996 | -0.357690 | 0.334218 | -0.413327 | -0.956618 | -0.011278 | -0.408511 | -0.037497 | ... | -1.214147 | -0.303270 | 0.135196 | 0.555519 | 0.122476 | 0.240908 | -0.056521 | 0.050130 | -0.211742 | 0.378766 |
| guide | -1.178150 | 0.731618 | -0.593804 | 0.415418 | 1.651968 | -0.073628 | 0.057820 | -0.396742 | -0.996873 | 0.429075 | ... | 0.614620 | -0.451440 | -1.288784 | -0.389848 | 0.017629 | -0.483427 | -0.484336 | 0.149367 | 0.323914 | -1.239273 |
| happily | -0.552708 | 0.431273 | -0.276702 | -0.626600 | 1.248792 | -0.940124 | -0.367457 | -0.736995 | 0.878078 | 0.467198 | ... | 0.395383 | -0.414090 | -0.134464 | -0.228733 | -0.090979 | 0.121471 | 0.091699 | -0.064050 | -0.222706 | 0.177931 |
| in | -0.825479 | -1.051559 | 0.758063 | 0.673075 | 0.240419 | 0.115155 | -0.920780 | 0.137270 | 1.564308 | -0.329897 | ... | -0.943635 | 0.426585 | -0.385542 | -0.110009 | -1.135049 | -0.032381 | -0.399695 | 0.427312 | 0.118642 | 0.217453 |
| interest | 0.087404 | -0.640190 | -0.453728 | -0.850574 | -0.157392 | -0.533750 | 0.637173 | -0.608730 | -0.633894 | 0.933405 | ... | 0.899409 | 1.313719 | 0.314493 | 0.620090 | -0.622023 | 0.070112 | -1.821788 | -0.131984 | -0.347964 | -0.195112 |
| its | -0.217026 | 0.894963 | -0.105397 | -0.401083 | -0.738265 | -0.480244 | -0.346089 | -0.352199 | 1.088866 | -0.914263 | ... | -0.941486 | 0.522719 | 0.290851 | 0.399336 | -0.462239 | -0.174071 | -0.092375 | 0.078811 | 0.854677 | -0.169473 |
| moral | 0.619293 | 0.499217 | 0.249499 | -0.751159 | 0.466876 | -0.062412 | -0.184458 | -0.660417 | 0.652479 | -1.639679 | ... | -0.382880 | 0.017941 | 0.048614 | -0.491065 | 0.557178 | 1.361697 | 0.181574 | -0.451610 | 0.301997 | -0.267717 |
| must | -0.157701 | 0.589252 | 0.273769 | 0.857760 | 0.567033 | 0.108004 | -1.062661 | -1.089537 | -0.240025 | -0.304565 | ... | 0.769697 | -0.041358 | -0.410516 | 0.544890 | -0.466517 | -1.122368 | 0.304115 | 0.411068 | 0.301097 | 0.433811 |
| objectives | -0.875071 | 0.340940 | -0.162661 | 0.404240 | 0.205024 | -0.591867 | 0.305278 | 0.519380 | 0.144190 | -0.521769 | ... | -0.237458 | -0.047777 | 0.328942 | -1.136463 | 0.665098 | 0.725972 | 0.299084 | -0.483213 | -0.158996 | -0.867711 |
| of | -0.327949 | -0.544503 | -0.092067 | -1.195621 | 0.039030 | -1.281222 | -0.414859 | 0.070955 | -0.199553 | -1.428178 | ... | -0.035053 | 0.183135 | 0.080120 | -0.322540 | -0.468436 | 0.623744 | 0.315495 | 0.084182 | 0.077692 | 0.203555 |
| order | -0.067636 | -1.451491 | 0.146754 | -0.419711 | 0.023377 | -0.810850 | -0.734983 | -0.020393 | -0.560763 | 0.204884 | ... | -0.109579 | 0.385012 | -0.002484 | -0.126181 | -0.709134 | 0.109215 | 1.085052 | 0.454821 | 0.048261 | 0.162490 |
| peoples | -0.221793 | -0.247048 | -0.341812 | 0.627216 | 1.513762 | -0.743124 | -0.456467 | -1.155934 | 0.653813 | -0.737646 | ... | 0.766392 | -0.728573 | -0.624886 | 0.481552 | 0.536343 | -0.368652 | 0.979576 | 0.851340 | -0.597788 | 0.720032 |
| phases | -0.914526 | 0.031760 | -0.057205 | -0.529099 | 0.335894 | -0.689959 | -0.019860 | 0.153372 | 0.436006 | -0.038952 | ... | -0.668004 | 0.173632 | -0.416013 | 0.557431 | -0.360914 | -0.490024 | -0.671597 | 0.189420 | 0.384801 | -0.988437 |
| practical | 0.497227 | 0.944226 | -0.767903 | -0.117738 | -0.066879 | 0.199990 | 0.709391 | 0.005245 | 0.146205 | -0.415131 | ... | 0.073728 | -0.115236 | -0.118528 | 0.329659 | -0.090454 | 0.459422 | -1.281357 | -0.437196 | -0.268200 | 0.232447 |
| push | -0.184951 | 0.711796 | 0.721361 | -0.777537 | 0.321118 | 0.102113 | -0.503164 | 0.589479 | 1.586533 | 0.306067 | ... | -0.007689 | -0.354350 | -0.783757 | 0.479150 | 0.510954 | 0.808599 | -1.763679 | 1.179172 | -0.048447 | -0.237922 |
| realization | 0.680622 | 0.343941 | 0.359005 | -0.110402 | 0.846315 | -1.126212 | 0.410226 | -0.506499 | -0.529498 | -1.226504 | ... | 0.534820 | 0.661075 | 0.972976 | 0.132633 | 0.495696 | -0.392315 | 0.310346 | 0.728478 | -0.813280 | 0.737264 |
| self | 0.257648 | 0.058829 | -0.417947 | 0.250967 | 0.102127 | -0.601216 | 0.257821 | -0.101740 | -0.363018 | -0.105056 | ... | 1.385324 | -0.540304 | -0.544518 | 0.790096 | 0.714771 | 0.062684 | -0.041591 | 0.477603 | 0.045479 | 0.348599 |
| social | 0.001411 | 0.083508 | -0.384193 | 0.038151 | 0.828193 | -0.295133 | -0.131154 | -0.310905 | -0.194347 | -0.355901 | ... | -0.688219 | -0.004625 | -0.023948 | 0.265741 | -0.029004 | -0.086471 | 0.103009 | 1.135597 | -0.636987 | -0.338693 |
| the | 1.259149 | -0.390939 | -0.574276 | -0.651518 | 0.975174 | -0.346215 | -0.203062 | 0.069047 | -0.972633 | -0.484539 | ... | -0.177573 | 0.605176 | -0.330061 | -0.963901 | 0.492528 | 0.279568 | -0.408966 | 0.528585 | -1.137273 | 0.370578 |
| their | -0.141068 | -0.385690 | -0.610704 | -0.631536 | 0.036775 | -0.008408 | 0.058548 | 0.447865 | -0.403555 | -0.706317 | ... | -0.900095 | 0.260387 | -0.259527 | -0.011186 | 0.366235 | -0.750171 | 0.071075 | 0.316691 | 0.273126 | -1.098873 |
| these | -0.867060 | -0.423800 | -0.462822 | 0.031337 | -0.921427 | 0.606853 | -0.407772 | 0.609299 | -0.070913 | 0.411936 | ... | -0.375234 | -0.172975 | 0.506652 | -0.142546 | 0.120540 | -1.079829 | -0.097479 | -0.512669 | -0.959007 | 0.081844 |
| through | -0.806322 | 0.435404 | -0.860480 | -0.352367 | -0.911619 | 0.371939 | -0.302082 | -0.138660 | -0.286805 | 0.741813 | ... | 0.205986 | 0.712598 | -0.843359 | 0.651225 | 0.107572 | 0.096148 | -0.935132 | 0.180383 | 0.055736 | -0.279496 |
| to | -0.467868 | -0.124578 | -0.321987 | 0.056667 | -0.164021 | 1.174730 | -0.633803 | 0.867373 | -1.580327 | -0.784350 | ... | -0.367765 | -0.081297 | 0.588999 | 0.236422 | 0.058517 | -0.139931 | 0.747637 | -0.138854 | -0.143276 | -0.009610 |
| under | 0.687285 | -0.849716 | 0.345514 | -0.015210 | -0.001491 | 0.095282 | -0.069619 | -0.023330 | -0.572866 | -1.167167 | ... | -0.920049 | 0.214902 | 0.161194 | -0.373119 | -0.300902 | -0.546927 | 0.374067 | 0.461742 | 0.977844 | -0.106500 |
| we | 0.493856 | -0.277951 | 1.282449 | -0.818842 | -0.315630 | 1.576497 | -0.768214 | -1.161283 | -1.137017 | 0.787399 | ... | 0.117417 | -0.324164 | -0.510348 | 0.801145 | -0.253922 | 0.568123 | 0.851370 | -0.558899 | -0.064633 | -0.407102 |
| which | -0.368462 | -0.361856 | 0.409696 | 0.587431 | 0.708897 | 0.291840 | -0.362036 | -0.530712 | -0.053614 | -0.711788 | ... | -1.084328 | -0.832080 | 0.269114 | 0.851772 | -0.942639 | -0.081171 | -1.273354 | -0.796438 | -0.260538 | 0.355565 |
| with | -0.388580 | -0.219293 | -0.652044 | -1.386387 | -0.089682 | -0.403928 | -1.394354 | 0.763693 | -0.379815 | -0.662670 | ... | -0.599527 | -0.076366 | 0.786938 | -1.619844 | -0.512546 | -0.580194 | -0.000503 | -0.181073 | -1.548287 | 0.417536 |
| world | 0.737164 | -0.141081 | 0.328124 | 0.615530 | 0.357371 | 0.274039 | -0.563317 | 0.040565 | -0.111780 | 0.075683 | ... | -0.404257 | 0.543159 | 0.008046 | 0.476506 | 0.046138 | -0.633671 | -0.073570 | -0.446745 | -0.119588 | -0.238425 |
56 rows × 768 columns