from transformers import BertModel, BertTokenizer, AutoTokenizer
import numpy as np
import streamlit as st
import re
import pandas as pd
from datetime import datetime
import nltk
import torch
BERT Functions
I define two functions to run BERT models. The first processes a single text document into a format that BERT recognizes; the second uses the tokenized text to generate embedding values from a pre-trained BERT model.
# Load a pre-trained BERT model and tokenizer.
# Note: the model checkpoint here is uncased while the tokenizer is cased;
# for aligned vocabularies the two should normally come from the same checkpoint.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
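As a quick check (my addition, not in the original notebook), the loaded model's config confirms the 768-dimensional hidden states and that the hidden states of every layer will be returned:

print(model.config.hidden_size)            # 768: width of each embedding vector
print(model.config.output_hidden_states)   # True: all layers' hidden states are returned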
# Input is "light.csv", which does not include stop words.
df = pd.read_csv('../../../data/processed/paragraph.csv')

# Filter
timestamps = df.year.to_list()
texts = df.text.to_list()
text = texts[1]
df.head(1)

| | Unnamed: 0 | ccode_iso | session | year | paragraph_index | text |
|---|---|---|---|---|---|---|
| 0 | 1 | AFG | 7 | 1952 | 1 | I consider it a great honour and privilege to ... |
print(type(text))
<class 'str'>
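Before defining the functions, it helps to see what the tokenizer produces. This quick preview (my addition) shows the WordPiece sub-tokens; words outside the vocabulary are split into pieces prefixed with '##', which is why terms such as "##s" appear in the embedding output further down:

print(tokenizer.tokenize(text)[:10])   # first ten WordPiece tokens of the paragraph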
Define functions
def bert_preprocess(text):
    """
    Preprocesses a document into a BERT-recognizable format
    Input: text in a string format
    Output: three objects ready to be used for BERT modeling
        marked_text (list)
        indexed_tokens (list)
        attention_mask (list)
    """
    # Tokenize the text
    tokenized_text = tokenizer.tokenize(text)
    truncate_length = len(tokenized_text) - 512 + 2  # +2 to account for [CLS] and [SEP]

    # Truncate the beginning and end of the text, but only when it exceeds the 512-token limit
    if truncate_length > 0:
        tokenized_text = tokenized_text[truncate_length//2 : -truncate_length//2]

    # Add special tokens [CLS] and [SEP], convert tokens to ids, and create attention mask
    marked_text = ["[CLS]"] + tokenized_text + ["[SEP]"]
    indexed_tokens = tokenizer.convert_tokens_to_ids(marked_text)
    attention_mask = [1] * len(indexed_tokens)

    # Pad sequences to the maximum sequence length of 512
    while len(indexed_tokens) < 512:
        indexed_tokens.append(0)
        attention_mask.append(0)

    return marked_text, indexed_tokens, attention_mask
marked_text, indexed_tokens, attention_mask = bert_preprocess(text)
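A quick sanity check (my addition) confirms the padded sequences come out at the expected length and that the special tokens sit at the ends:

print(len(indexed_tokens), len(attention_mask))   # both should be 512
print(marked_text[0], marked_text[-1])            # [CLS] [SEP]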
help(get_bert_embeddings)

Help on function get_bert_embeddings in module __main__:

get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
    input: processed text
    output: dataframe of embedding weights for each token
    ex) dimension of 512*768 where row represents token, column represents bert features
def get_bert_embeddings(marked_text, indexed_tokens, attention_mask):
    """
    Generates embedding values for tokenized text
    Input: marked_text, indexed_tokens, and attention_mask (all in list format)
    Output: dataframe of embedding weights for each token
        ex) dimension of 512*768 where row represents token, column represents BERT features
    """
    # Convert lists to PyTorch tensors
    tokens_tensors = torch.tensor([indexed_tokens])
    attention_masks = torch.tensor([attention_mask])

    with torch.no_grad():
        # Run the model
        outputs = model(input_ids=tokens_tensors.view(-1, tokens_tensors.size(-1)),
                        attention_mask=attention_masks.view(-1, attention_masks.size(-1)))

    # Extract the hidden states: outputs[2] holds the hidden states of every layer,
    # and [0] selects the embedding-layer output
    hidden_states = outputs[2][0].squeeze().numpy()

    # Convert to a data frame
    pd_words = pd.Series(marked_text, name='term')
    df_outputs = pd.DataFrame(hidden_states)
    df_outputs['term'] = pd_words

    # Move 'term' column to the first position
    df_outputs = df_outputs[['term'] + [col for col in df_outputs.columns if col != 'term']]

    # Remove duplicate tokens by averaging their embeddings
    df_outputs_embedding = df_outputs.groupby(['term']).mean()
    return df_outputs_embedding
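One caveat on the positional indexing above (my note, not in the original): on recent versions of transformers the model returns a ModelOutput object rather than a plain tuple, so the hidden states can also be read by attribute, which is clearer and does not depend on field order:

# Drop-in alternative to outputs[2][0] inside get_bert_embeddings,
# assuming a transformers version that returns a ModelOutput object:
hidden_states = outputs.hidden_states[0].squeeze().numpy()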
get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
term | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
[SEP] | -0.599731 | -0.287527 | 0.995737 | -0.067600 | -0.116662 | -0.319243 | -0.035646 | -0.550722 | -0.154269 | 0.226906 | ... | 0.433558 | 0.360281 | 0.753348 | -1.155800 | 0.198939 | 0.126193 | 0.058285 | 0.035218 | -0.225301 | -0.376395 |
##s | -0.082928 | 0.064610 | 0.062934 | 1.201868 | 0.416490 | -0.351008 | -0.419693 | 0.793464 | -0.682201 | -0.435875 | ... | -1.640653 | -0.082774 | 1.440754 | 0.477181 | 0.555801 | 0.517778 | 0.029644 | 0.167330 | -0.804072 | 1.100950 |
, | 0.238827 | -0.499530 | -0.229385 | -0.420359 | 0.382101 | -0.133325 | -0.423249 | -0.133761 | 0.079275 | -0.810453 | ... | -0.476354 | 0.310680 | -0.071447 | -0.350534 | -0.166876 | -0.152760 | 0.157087 | 0.182910 | -0.305537 | 0.119838 |
- | 0.211686 | -0.337158 | -0.282966 | -0.379349 | 0.213550 | -0.254544 | -0.361127 | -0.094978 | 0.072562 | -0.825429 | ... | -0.457777 | 0.271165 | 0.238502 | -0.122299 | -0.090638 | -0.070707 | 0.017792 | -0.049120 | -0.269647 | 0.226714 |
. | 0.117108 | -0.388444 | -0.088623 | 0.064858 | 0.523230 | -0.428734 | -0.267266 | -0.420127 | 0.190312 | -0.766927 | ... | -0.490892 | 0.354983 | -0.355035 | -0.418002 | 0.253672 | 0.086620 | 0.108094 | -0.150912 | -0.198612 | 0.161442 |
In | 1.179606 | 0.055646 | 0.182922 | 1.080442 | 0.191964 | -0.443555 | -0.347383 | 0.757875 | -0.356111 | -1.096234 | ... | 0.154227 | 0.468894 | 0.522242 | -0.500889 | 0.947098 | 1.150616 | -0.767531 | -0.148597 | -0.223548 | 0.134304 |
Nations | 0.006905 | -0.580956 | 0.575177 | -1.220215 | 0.371888 | -0.279317 | 0.700901 | -0.903072 | 0.631587 | -0.739470 | ... | 0.495864 | 0.132003 | 0.067810 | 0.293461 | 0.424441 | 0.552663 | 0.318386 | -0.620372 | -0.583365 | 0.048380 |
United | -0.077556 | -0.105724 | 1.057649 | -0.423865 | 0.314821 | -0.185792 | -0.714371 | -0.607652 | -0.070977 | 0.103325 | ... | -0.704180 | -1.008595 | -0.426895 | -0.018947 | 0.567454 | 0.483050 | -0.180623 | -0.287286 | -0.556010 | 0.511452 |
[CLS] | -0.118897 | -0.518255 | 0.159338 | -0.461482 | -0.003488 | -0.453042 | -0.212884 | -0.229699 | -0.063944 | -0.421272 | ... | 0.242429 | 0.009379 | 0.467546 | -0.957577 | 0.114305 | -0.369990 | 0.035248 | 0.089144 | -0.146707 | -0.127492 |
accomplish | -0.085958 | 0.289747 | 0.502433 | -0.699373 | -0.112547 | 0.115439 | 1.011047 | -0.570650 | -0.124616 | -0.372738 | ... | -1.153663 | -0.272904 | 0.580479 | 0.201596 | -0.194586 | -0.872371 | -0.358961 | 0.039148 | 0.101910 | -0.169740 |
acts | -0.178121 | 0.666130 | 0.413916 | 0.099232 | -1.034455 | -0.040334 | -0.048890 | -0.098929 | -0.145200 | 0.480938 | ... | -1.342113 | 0.323047 | 0.763901 | -1.770176 | 0.986050 | -0.538208 | -0.744755 | 0.057135 | -0.445161 | -0.774442 |
aid | -0.427848 | 0.147286 | 0.013369 | 0.471762 | 0.168422 | 0.895970 | 0.872069 | 1.716045 | 0.837368 | 0.494484 | ... | -0.787848 | -0.445969 | 0.358512 | 0.965379 | 0.365942 | 0.054571 | -0.555102 | -0.860554 | -0.171662 | -1.067538 |
aims | 1.024410 | -0.256186 | -0.053223 | 0.638275 | -0.077131 | -0.311400 | -0.234701 | -0.026557 | 0.915609 | -0.596711 | ... | -0.522890 | 1.130598 | -0.809434 | -0.718155 | -0.150897 | -0.814818 | -0.124106 | -1.064612 | -0.574141 | -0.703143 |
all | -0.002165 | 0.033546 | 0.609011 | -0.239792 | -0.932221 | -1.037787 | 0.136487 | -0.882456 | 0.342895 | -0.798731 | ... | 0.239174 | -0.662943 | 0.205603 | 0.161089 | -0.176249 | -0.105027 | 0.036648 | 0.276853 | 0.527183 | -0.216994 |
and | 0.685023 | 0.240559 | -0.555069 | -0.011118 | 0.690408 | 0.024725 | -0.583992 | -0.181338 | -0.816213 | 0.622959 | ... | -0.739703 | 0.255513 | 0.252658 | -0.401268 | 0.020371 | 0.737326 | 0.633780 | -0.291272 | -0.483794 | -0.894787 |
areas | 0.720950 | -0.746471 | 0.160797 | -0.725814 | 1.154829 | -0.101660 | -0.124824 | -0.682128 | -0.094830 | 0.434859 | ... | -0.126514 | 0.163580 | 0.774643 | 0.072612 | 0.070189 | 0.888373 | 0.193730 | 0.033260 | 0.078655 | 0.414108 |
as | -0.917634 | 0.064711 | -0.368008 | 0.218496 | 1.246304 | -0.059035 | -1.098246 | 0.450588 | -1.106012 | -1.061205 | ... | -0.457125 | -0.102798 | 0.220228 | -0.558394 | -0.207785 | 0.430128 | -1.262651 | 0.324153 | -0.113458 | 0.350910 |
assistance | -0.678422 | 0.172338 | 0.265142 | 0.258749 | -0.105260 | -0.331368 | -0.350325 | 0.263004 | -0.668104 | -0.555148 | ... | -0.475693 | 0.409995 | 0.346237 | 0.555776 | 0.440133 | -0.523475 | 0.134694 | 0.275381 | -0.253818 | 0.210988 |
coincide | -0.170578 | -0.268886 | -0.115447 | -0.694355 | 0.483255 | -0.280104 | -0.640158 | 0.034518 | -0.001312 | 0.433044 | ... | -0.380226 | -0.208592 | -0.748574 | -0.197001 | 0.404674 | -0.318124 | -0.521167 | 0.667394 | -0.002012 | 1.150480 |
collective | 0.151802 | 0.762978 | -0.005754 | 0.526812 | 0.764982 | 0.812513 | -0.570761 | -0.021987 | -1.306559 | -0.586478 | ... | -0.281096 | -1.654515 | -0.409773 | -0.740668 | 0.533563 | -0.904656 | 0.133838 | -0.766242 | 0.390741 | -0.584280 |
conscience | 0.158133 | -0.274482 | -0.492211 | -0.170320 | 0.652437 | -0.445447 | 0.168138 | -0.694122 | -0.364806 | -1.202534 | ... | -0.850401 | -0.854405 | 0.362089 | 0.396198 | 0.172970 | -0.543768 | -1.138113 | 0.421941 | 1.142612 | 1.083518 |
cultural | -1.082594 | -0.155604 | 0.376953 | -0.332056 | -0.495655 | -1.082139 | 0.782310 | -0.506714 | 0.033708 | 0.338000 | ... | 0.613047 | 0.565934 | -1.216563 | 0.057687 | -0.932211 | -0.200133 | -0.734939 | 0.179817 | -0.215364 | -0.216755 |
developed | -0.195352 | 0.150783 | 0.080199 | -0.162700 | 0.360601 | 0.163544 | -0.581970 | 0.052141 | 0.185778 | -0.233883 | ... | -1.082397 | 0.500910 | 0.256611 | -0.607807 | -0.145842 | 0.529889 | 0.446537 | 0.195365 | -0.643610 | -0.130024 |
development | 0.298387 | 0.496667 | -0.434238 | 0.093009 | -1.293615 | 0.209092 | -0.368899 | 0.919280 | 0.138753 | -0.650476 | ... | -0.645376 | 1.387810 | 0.642955 | 0.327757 | -0.424770 | -0.255853 | 0.202628 | 0.501966 | -0.376638 | -0.267928 |
duties | -0.651811 | -0.242606 | 0.490167 | -0.137662 | 0.505857 | -0.002086 | 1.298450 | 0.396070 | 1.254041 | -0.024695 | ... | 0.302751 | -0.295479 | -0.401752 | -0.347428 | -0.645984 | -0.070883 | -1.304861 | -0.363276 | -0.646936 | -1.105535 |
economic | 0.327582 | 0.387518 | -0.703620 | 0.579996 | 0.298967 | 0.924665 | 0.373069 | -0.637522 | 0.973225 | 0.572886 | ... | 0.214301 | 0.598506 | 0.876789 | 0.094422 | 0.311424 | -0.415416 | -0.762125 | 0.453513 | 0.134451 | -0.050874 |
ends | -0.546396 | -0.359361 | 0.212410 | 0.277001 | 0.106774 | 0.592411 | -0.247585 | -0.792451 | -0.884770 | 0.545342 | ... | -0.271307 | -0.330248 | 0.591369 | 0.783333 | 1.241485 | -0.296724 | 0.078918 | 0.412234 | 0.387542 | 0.192228 |
for | -0.253029 | -0.404490 | -0.261728 | 0.038780 | 0.154741 | 0.588547 | -1.170278 | -0.019293 | 0.076067 | -1.264595 | ... | -0.787064 | -0.461386 | -0.638114 | -0.482141 | -0.344258 | -0.054569 | 0.378593 | -0.465482 | -0.611659 | 0.433159 |
forward | -0.207178 | 0.236094 | -0.957996 | -0.357690 | 0.334218 | -0.413327 | -0.956618 | -0.011278 | -0.408511 | -0.037497 | ... | -1.214147 | -0.303270 | 0.135196 | 0.555519 | 0.122476 | 0.240908 | -0.056521 | 0.050130 | -0.211742 | 0.378766 |
guide | -1.178150 | 0.731618 | -0.593804 | 0.415418 | 1.651968 | -0.073628 | 0.057820 | -0.396742 | -0.996873 | 0.429075 | ... | 0.614620 | -0.451440 | -1.288784 | -0.389848 | 0.017629 | -0.483427 | -0.484336 | 0.149367 | 0.323914 | -1.239273 |
happily | -0.552708 | 0.431273 | -0.276702 | -0.626600 | 1.248792 | -0.940124 | -0.367457 | -0.736995 | 0.878078 | 0.467198 | ... | 0.395383 | -0.414090 | -0.134464 | -0.228733 | -0.090979 | 0.121471 | 0.091699 | -0.064050 | -0.222706 | 0.177931 |
in | -0.825479 | -1.051559 | 0.758063 | 0.673075 | 0.240419 | 0.115155 | -0.920780 | 0.137270 | 1.564308 | -0.329897 | ... | -0.943635 | 0.426585 | -0.385542 | -0.110009 | -1.135049 | -0.032381 | -0.399695 | 0.427312 | 0.118642 | 0.217453 |
interest | 0.087404 | -0.640190 | -0.453728 | -0.850574 | -0.157392 | -0.533750 | 0.637173 | -0.608730 | -0.633894 | 0.933405 | ... | 0.899409 | 1.313719 | 0.314493 | 0.620090 | -0.622023 | 0.070112 | -1.821788 | -0.131984 | -0.347964 | -0.195112 |
its | -0.217026 | 0.894963 | -0.105397 | -0.401083 | -0.738265 | -0.480244 | -0.346089 | -0.352199 | 1.088866 | -0.914263 | ... | -0.941486 | 0.522719 | 0.290851 | 0.399336 | -0.462239 | -0.174071 | -0.092375 | 0.078811 | 0.854677 | -0.169473 |
moral | 0.619293 | 0.499217 | 0.249499 | -0.751159 | 0.466876 | -0.062412 | -0.184458 | -0.660417 | 0.652479 | -1.639679 | ... | -0.382880 | 0.017941 | 0.048614 | -0.491065 | 0.557178 | 1.361697 | 0.181574 | -0.451610 | 0.301997 | -0.267717 |
must | -0.157701 | 0.589252 | 0.273769 | 0.857760 | 0.567033 | 0.108004 | -1.062661 | -1.089537 | -0.240025 | -0.304565 | ... | 0.769697 | -0.041358 | -0.410516 | 0.544890 | -0.466517 | -1.122368 | 0.304115 | 0.411068 | 0.301097 | 0.433811 |
objectives | -0.875071 | 0.340940 | -0.162661 | 0.404240 | 0.205024 | -0.591867 | 0.305278 | 0.519380 | 0.144190 | -0.521769 | ... | -0.237458 | -0.047777 | 0.328942 | -1.136463 | 0.665098 | 0.725972 | 0.299084 | -0.483213 | -0.158996 | -0.867711 |
of | -0.327949 | -0.544503 | -0.092067 | -1.195621 | 0.039030 | -1.281222 | -0.414859 | 0.070955 | -0.199553 | -1.428178 | ... | -0.035053 | 0.183135 | 0.080120 | -0.322540 | -0.468436 | 0.623744 | 0.315495 | 0.084182 | 0.077692 | 0.203555 |
order | -0.067636 | -1.451491 | 0.146754 | -0.419711 | 0.023377 | -0.810850 | -0.734983 | -0.020393 | -0.560763 | 0.204884 | ... | -0.109579 | 0.385012 | -0.002484 | -0.126181 | -0.709134 | 0.109215 | 1.085052 | 0.454821 | 0.048261 | 0.162490 |
peoples | -0.221793 | -0.247048 | -0.341812 | 0.627216 | 1.513762 | -0.743124 | -0.456467 | -1.155934 | 0.653813 | -0.737646 | ... | 0.766392 | -0.728573 | -0.624886 | 0.481552 | 0.536343 | -0.368652 | 0.979576 | 0.851340 | -0.597788 | 0.720032 |
phases | -0.914526 | 0.031760 | -0.057205 | -0.529099 | 0.335894 | -0.689959 | -0.019860 | 0.153372 | 0.436006 | -0.038952 | ... | -0.668004 | 0.173632 | -0.416013 | 0.557431 | -0.360914 | -0.490024 | -0.671597 | 0.189420 | 0.384801 | -0.988437 |
practical | 0.497227 | 0.944226 | -0.767903 | -0.117738 | -0.066879 | 0.199990 | 0.709391 | 0.005245 | 0.146205 | -0.415131 | ... | 0.073728 | -0.115236 | -0.118528 | 0.329659 | -0.090454 | 0.459422 | -1.281357 | -0.437196 | -0.268200 | 0.232447 |
push | -0.184951 | 0.711796 | 0.721361 | -0.777537 | 0.321118 | 0.102113 | -0.503164 | 0.589479 | 1.586533 | 0.306067 | ... | -0.007689 | -0.354350 | -0.783757 | 0.479150 | 0.510954 | 0.808599 | -1.763679 | 1.179172 | -0.048447 | -0.237922 |
realization | 0.680622 | 0.343941 | 0.359005 | -0.110402 | 0.846315 | -1.126212 | 0.410226 | -0.506499 | -0.529498 | -1.226504 | ... | 0.534820 | 0.661075 | 0.972976 | 0.132633 | 0.495696 | -0.392315 | 0.310346 | 0.728478 | -0.813280 | 0.737264 |
self | 0.257648 | 0.058829 | -0.417947 | 0.250967 | 0.102127 | -0.601216 | 0.257821 | -0.101740 | -0.363018 | -0.105056 | ... | 1.385324 | -0.540304 | -0.544518 | 0.790096 | 0.714771 | 0.062684 | -0.041591 | 0.477603 | 0.045479 | 0.348599 |
social | 0.001411 | 0.083508 | -0.384193 | 0.038151 | 0.828193 | -0.295133 | -0.131154 | -0.310905 | -0.194347 | -0.355901 | ... | -0.688219 | -0.004625 | -0.023948 | 0.265741 | -0.029004 | -0.086471 | 0.103009 | 1.135597 | -0.636987 | -0.338693 |
the | 1.259149 | -0.390939 | -0.574276 | -0.651518 | 0.975174 | -0.346215 | -0.203062 | 0.069047 | -0.972633 | -0.484539 | ... | -0.177573 | 0.605176 | -0.330061 | -0.963901 | 0.492528 | 0.279568 | -0.408966 | 0.528585 | -1.137273 | 0.370578 |
their | -0.141068 | -0.385690 | -0.610704 | -0.631536 | 0.036775 | -0.008408 | 0.058548 | 0.447865 | -0.403555 | -0.706317 | ... | -0.900095 | 0.260387 | -0.259527 | -0.011186 | 0.366235 | -0.750171 | 0.071075 | 0.316691 | 0.273126 | -1.098873 |
these | -0.867060 | -0.423800 | -0.462822 | 0.031337 | -0.921427 | 0.606853 | -0.407772 | 0.609299 | -0.070913 | 0.411936 | ... | -0.375234 | -0.172975 | 0.506652 | -0.142546 | 0.120540 | -1.079829 | -0.097479 | -0.512669 | -0.959007 | 0.081844 |
through | -0.806322 | 0.435404 | -0.860480 | -0.352367 | -0.911619 | 0.371939 | -0.302082 | -0.138660 | -0.286805 | 0.741813 | ... | 0.205986 | 0.712598 | -0.843359 | 0.651225 | 0.107572 | 0.096148 | -0.935132 | 0.180383 | 0.055736 | -0.279496 |
to | -0.467868 | -0.124578 | -0.321987 | 0.056667 | -0.164021 | 1.174730 | -0.633803 | 0.867373 | -1.580327 | -0.784350 | ... | -0.367765 | -0.081297 | 0.588999 | 0.236422 | 0.058517 | -0.139931 | 0.747637 | -0.138854 | -0.143276 | -0.009610 |
under | 0.687285 | -0.849716 | 0.345514 | -0.015210 | -0.001491 | 0.095282 | -0.069619 | -0.023330 | -0.572866 | -1.167167 | ... | -0.920049 | 0.214902 | 0.161194 | -0.373119 | -0.300902 | -0.546927 | 0.374067 | 0.461742 | 0.977844 | -0.106500 |
we | 0.493856 | -0.277951 | 1.282449 | -0.818842 | -0.315630 | 1.576497 | -0.768214 | -1.161283 | -1.137017 | 0.787399 | ... | 0.117417 | -0.324164 | -0.510348 | 0.801145 | -0.253922 | 0.568123 | 0.851370 | -0.558899 | -0.064633 | -0.407102 |
which | -0.368462 | -0.361856 | 0.409696 | 0.587431 | 0.708897 | 0.291840 | -0.362036 | -0.530712 | -0.053614 | -0.711788 | ... | -1.084328 | -0.832080 | 0.269114 | 0.851772 | -0.942639 | -0.081171 | -1.273354 | -0.796438 | -0.260538 | 0.355565 |
with | -0.388580 | -0.219293 | -0.652044 | -1.386387 | -0.089682 | -0.403928 | -1.394354 | 0.763693 | -0.379815 | -0.662670 | ... | -0.599527 | -0.076366 | 0.786938 | -1.619844 | -0.512546 | -0.580194 | -0.000503 | -0.181073 | -1.548287 | 0.417536 |
world | 0.737164 | -0.141081 | 0.328124 | 0.615530 | 0.357371 | 0.274039 | -0.563317 | 0.040565 | -0.111780 | 0.075683 | ... | -0.404257 | 0.543159 | 0.008046 | 0.476506 | 0.046138 | -0.633671 | -0.073570 | -0.446745 | -0.119588 | -0.238425 |
56 rows × 768 columns
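With the per-term embedding dataframe in hand, the vectors can be compared directly. The sketch below (my addition, using only numpy) measures the cosine similarity between two of the terms shown above:

def cosine_similarity(u, v):
    # Cosine similarity between two 768-dimensional embedding vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

embeddings = get_bert_embeddings(marked_text, indexed_tokens, attention_mask)
print(cosine_similarity(embeddings.loc['economic'].values,
                        embeddings.loc['social'].values))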