import nltk, re
import pandas as pd
from spacy.lang.da.stop_words import STOP_WORDS
import lemmy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import bz2
import _pickle as cPickle
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import Image


# Load data
data = bz2.BZ2File('../data/picl_data_l3.pbz2', 'rb')
data = cPickle.load(data)


# Remove formatting and law intro sentences: "Oversigt (indholdsfortegnelse)" and "Den fulde tekst"
data['tokens'] = [re.sub(r'\xa0|\\r\\n|\\r|\\n|Oversigt \(indholdsfortegnelse\)|Den fulde tekst',' ',w.lower()) for w in data['full_text']]

# Tokenize and remove punctuation and numbers
data['tokens'] = [nltk.word_tokenize(re.sub(r'[^\w\s]|\d',' ',w.lower())) for w in data['tokens']]

# Danish lemmatizer without word tags
lemmatizer = lemmy.load("da")
data['tokens'] = [[min(lemmatizer.lemmatize("", w)) for w in ws] for ws in data['tokens']]

# Remove stopwords and stk, nr, pkt, jf
STOP_WORDS.update(['stk', 'nr', 'pkt', 'jf'])
data['tokens'] = [[w for w in ws if w not in STOP_WORDS] for ws in data['tokens']]

# Remove words with less than 2 characters
data['tokens'] = [[w for w in ws if len(w) > 1] for ws in data['tokens']]

# Lower case
data['tokens'] = [[w.lower() for w in ws] for ws in data['tokens']]


# First create string out of tokens
data['string'] = [[] for _ in range(len(data))]
for i in range(len(data)):
    data['string'].values[i] = ' '.join(data['tokens'].values[i])
    
# Code from : https://medium.com/analytics-vidhya/demonstrating-calculation-of-tf-idf-from-sklearn-4f9526e7e78b
# Using sklearn is much faster than manually creating the matrices 
cv = CountVectorizer()
word_count_vector = cv.fit_transform(data['string'])
tf = pd.DataFrame(word_count_vector.toarray(), columns=cv.get_feature_names())

tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(word_count_vector)
idf = pd.DataFrame({'feature_name':cv.get_feature_names(), 'idf_weights':tfidf_transformer.idf_})

tf_idf = pd.DataFrame(X.toarray() ,columns=cv.get_feature_names())


# Now make dictionary with token:TF-IDF values for each law
tfidf = [{} for x in range(len(data))]
for i in range(len(data)):
    for t in set(data['tokens'].values[i]):
        tfidf[i][t] = tf_idf.loc[i][t]

# Save TF-IDF data as compressed bz2 pickle file
# with bz2.BZ2File('data/tfidf' + '.pbz2', 'w') as f:
#     cPickle.dump(tfidf, f)


# Creating processed corpus text and some basic stats
clean_corpus = []
for i in range(len(data)):
    clean_corpus += (data['tokens'].values[i])

print(f"Number of words in corpus: {len(clean_corpus)}")
print(f"Number of unique words in corpus: {len(set(clean_corpus))}")

Number of words in corpus: 6286773
Number of unique words in corpus: 55675


# Text statistics
word_count_per_law= pd.DataFrame([len(w) for w in data['tokens']], columns =['Wordcount per law'])
print(f"Law with the maximum wordcount in corpus: {data['title'].loc[word_count_per_law['Wordcount per law'].idxmax()]}, with {max(word_count_per_law['Wordcount per law']):d} words.")
print(f"Law with the maximum wordcount in corpus: {data['title'].loc[word_count_per_law['Wordcount per law'].idxmin()]}, with {min(word_count_per_law['Wordcount per law']):d} words.")
print(f"Median word count in laws: {word_count_per_law['Wordcount per law'].median():.0f} words.")
print(f"Percentage of laws with less than 50 words: {len([w for w in word_count_per_law['Wordcount per law'] if w < 50])/len(word_count_per_law):.1%}")

Law with the maximum wordcount in corpus: Forslag til Lov om en aktiv beskæftigelsesindsats, with 157473 words.
Law with the maximum wordcount in corpus: Lov om tillægsbevilling for finansåret 2021, with 8 words.
Median word count in laws: 758 words.
Percentage of laws with less than 50 words: 1.6%


# Histogram wordcount across documents in corpus
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
word_count_per_law.plot.hist(ax=ax[0], column='Wordcount per law', bins=25, grid=True, figsize=(10,8), color='maroon', log=True)
ax[0].set_title('Log scale histogram of wordcount per law', fontsize=12)
ax[0].set_xlabel('Wordcount')
ax[0].set_ylabel('Laws')
ax[0].get_legend().remove()

word_count_per_law[word_count_per_law['Wordcount per law'] < 1000].plot.hist(ax=ax[1], column='Wordcount per law', bins=25, grid=True, figsize=(10,8), color='maroon', log=True)
ax[1].set_title('Log scale histogram of wordcount up to 1000 per law', fontsize=12)
ax[1].set_xlabel('Wordcount')
ax[1].set_ylabel('Laws')

ax[1].get_legend().remove()

plt.suptitle('Wordcount per law', fontsize=16)
# fig.savefig('static/images/word_count_per_law.png')
plt.show()


# Cumulative word frequency distributions for corpus
word_count_clean = nltk.FreqDist(clean_corpus)
fig = plt.figure(figsize=(20, 10))
word_count_clean.plot(25,cumulative=True, title='Top 25 Cumulative word frequency distributions for corpus')
plt.show()
# fig.savefig('static/images/cumulative_word_frequency_corpus.png')


# Generate wordclouds based on TF-IDF values
wordcloud_data = []
for i in range(len(data)):
    wordcloud = WordCloud(background_color='white', width=600, height=400).generate_from_frequencies(tfidf[i])
    plt.figure(figsize=[15, 10])
    plt.imshow(wordcloud)
    plt.axis("off")
    filename = 'plots/wordclouds/'+str(data['id'].values[i])+'.png'
    wordcloud_data.append([data['id'].values[i], filename])
    print("Saving file nr.",i,filename)
    plt.savefig(filename, bbox_inches='tight',pad_inches = 0)
    plt.close()


Image(filename='../plots/wordclouds/219731.png')


import pandas as pd
import networkx as nx
from networkx.algorithms.community.centrality import girvan_newman
import matplotlib.pyplot as plt
import netwulf as nw
import bz2
import pickle
import _pickle as cPickle
data = bz2.BZ2File('../data/picl_data.pbz2', 'rb')
data = cPickle.load(data)


# Simple helper function for loading and writing processed data
def read(lvl):
    df = bz2.BZ2File(f'data/picl_data_l{lvl}.pbz2', 'rb') # Or data/urls_l1.pkl
    df = cPickle.load(df)
    url = pd.read_pickle(f'data/urls_l{lvl}.pkl')
    return df, url

def write(df):
    with bz2.BZ2File(f'data/picl_data_final__.pbz2', 'w') as f:
        cPickle.dump(df, f)


'''
This functions builds the graphs, one for graphs with historic nodes and referrences and one without
'''
def build(dff):
    G = nx.empty_graph(0, None)
    for node in dff[["id", "edgesUrl", "isHistorical", "stateLabel"]].itertuples():
        G.add_nodes_from(dff.id)
        if not node.isHistorical and node.stateLabel != None:
            G.add_node(node.id)
            for edge in node.edgesUrl:
                if not dff.id[dff.EliUrl.str.endswith(edge.split('dk/')[1])].empty:
                    nbrIsHis = dff.isHistorical[dff.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                    nbrRatified = dff.stateLabel[dff.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                    if not nbrIsHis and not nbrRatified:
                        nbr = dff.id[dff.EliUrl.str.endswith(edge.split('dk/')[1])]
                        G.add_edge(node.id, nbr.to_numpy().item())
                    # G.add_edges_from(((node, nbr) for _, node, nbrlist in [node for node in df[["EliUrl", "edgesUrl"]].itertuples()]))  # for nbr in nbrlist[1]))
    return G

def buildwithHist(df):
    G = nx.empty_graph(0, None)
    for node in df[["id", "edgesUrl", "isHistorical", "stateLabel"]].itertuples():
        G.add_nodes_from(df.id)
        if node.stateLabel != None:
            G.add_node(node.id)
            for edge in node.edgesUrl:
                if not df.id[df.EliUrl.str.endswith(edge.split('dk/')[1])].empty:
                #     nbrIsHis = df.isHistorical[df.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                #     nbrRatified = df.stateLabel[df.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                #     if not nbrIsHis and not nbrRatified:
                    nbr = df.id[df.EliUrl.str.endswith(edge.split('dk/')[1])]
                    G.add_edge(node.id, nbr.to_numpy().item())
                    # G.add_edges_from(((node, nbr) for _, node, nbrlist in [node for node in df[["EliUrl", "edgesUrl"]].itertuples()]))  # for nbr in nbrlist[1]))
    return G


'''
interEdges() is build to be able to read out the number of edges between the three levels
It is used in this way: 
dfSelfEdges{LVL} = interEdges(df.loc[df['rlvl'] == {LVL}], df.loc[df['rlvl'] == {LVL}])
nodeAttr() handles adding attribute data from our dataset
exclusive() was created to handle i mishab when initially concattenating the three levels without preserving the 
level attribute.
Fixing the mishab is done with these following self-explanatory lines of code:
    df1, _ = read(1)
    df2, _ = read(2)
    df3, _ = read(3)
    dfe2 = exclusive(df1, df2)
    dfe3 = exclusive(dfe2, df3)
    df1['rlvl'] = 1
    dfe2['rlvl'] = 2
    dfe3['rlvl'] = 3
    df = pd.concat([df1,dfe2,dfe3])
    df = df.drop_duplicates(subset=["id"])
    write(df)
'''
def interEdges(df1, df2):
    reverseEdge = 0
    for node in df2[["id", "edgesUrl"]].itertuples():
        for edge in node.edgesUrl:
            if not df1.id[df1.EliUrl.str.endswith(edge.split('dk/')[1])].empty:
                refIsHist = df1.isHistorical[df1.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                refIsRat = df1.stateLabel[df1.EliUrl.str.endswith(edge.split('dk/')[1])].values.item()
                if not refIsHist and not refIsRat:
                    reverseEdge += 1
    return reverseEdge

def nodeAttr(G, df):
    for node in df.itertuples():
        sizeoftext =  np.log2(node.full_text.__sizeof__())
        # attrs = {node.id: {"lvl": node.rlvl, "group": node.ressort, "size": sizeoftext}}
        attrs = {node.id: {"group": node.rlvl, "ressort": node.ressort, "size": sizeoftext}}
        # attrs = {node.id: {"lvl": node.rlvl, "ressort": node.ressort, "size": sizeoftext, "group": node.documentTypeId}}
        nx.set_node_attributes(G, attrs)

def exclusive(df1, df2):
    exclusives = []
    for node in df2.itertuples(index=False):
        if df1.id[df1.id == node.id].empty:
            exclusives.append(node)
    return pd.DataFrame(exclusives, columns=df2.columns)


G = nx.DiGraph()

idToIndexMapper = {}

uniqueIndex = 0

for index, row in data.iterrows():
    parent_id = getIdFromUrl(row.url)
    if parent_id not in idToIndexMapper:
        idToIndexMapper[parent_id] = {"indexInGraph": uniqueIndex, "url": row.url}
        uniqueIndex += 1
    edges = row["metadata"]
    for edge in edges:
        for key in edge:
            edge_id = getIdFromUrl(key)
            if edge_id not in idToIndexMapper:
                idToIndexMapper[edge_id] = {"indexInGraph": uniqueIndex, "url": edge}
                uniqueIndex += 1
            G.add_edge(idToIndexMapper[parent_id]["indexInGraph"], idToIndexMapper[edge_id]["indexInGraph"])

nx.draw(G)


sorted(G.degree, key=lambda x: x[1], reverse=True)

for key in idToIndexMapper:
    if idToIndexMapper[key]["indexInGraph"] == 91:
        print("Highest degree", idToIndexMapper[key])

Highest degree {'indexInGraph': 91, 'url': {'https://www.retsinformation.dk/eli/lta/2021/1735': '{placeholder}'}}


communities = girvan_newman(G, most_valuable_edge=None)
node_groups = []
for com in next(communities):
    node_groups.append(list(com))

colors = ['blue', 'green', 'yellow', 'red', "cyan", "magenta", "#B23AEE", "#B4EEB4", "#FF1493", "#FFF68F"]
color_index = 0
color_map = {}
for node in G:
    color_index = 0
    node_added = False
    for node_group in node_groups:
        if node in node_group:
            color_map[node] = colors[color_index]
        color_index += 1
        node_added = True
    if node_added is False:
        print("sdfsd")
        color_map[node](colors[color_index])

nx.set_node_attributes(G, color_map, "group")


nw.visualize(G)

Data field	rlvl	id	title	documentTypeId	shortName	full_text	isHistorical	ressort	EliUrl	stateLabel	metadata	edges	edgesUrl
Description	Recursive level 1-3	Unique id for law	Official title	Type of document, e.g. Lov or Ændringslov	Shortname identifier	Full text of law	Is this law historical or applicable	Ministry responsible	API link	If law has been passed	Metadata, including date of publication	References to other laws	Links to references
Example	2	224291	Lov om ændring af lov om...	20	LOV nr 1439 af 29/06/2021	Den fulde tekstLov om ændring af lov ...	False	Beskæftigelsesministeriet	https://www.retsinformation.dk/api/document/eli/lta/2021/1439	Vedtaget	{'displayName': 'Offentliggørelsesdato', 'displayValue': '30/06/2021'}	['LOV nr 1641 af 19/11/2020', 'LBK nr 2566 af 13/12/2021']	['https://www.retsinformation.dk/eli/lta/2020/1641', 'https://www.retsinformation.dk/eli/ft/202013LA0235', ...]

	Level 1	Level 2	Level 3
Laws	343	509	749
References post-clean	2003	4191	11440

Project Assignment¶

Group 6¶

Link to Git repository¶

Contribution statement¶

Motivation¶

1. What is your dataset?¶

2. Why did you choose this/these particular dataset(s)?¶

3. What was your goal for the end user's experience?¶

Basic stats. Let's understand the dataset better¶

Tools, theory and analysis¶

Describe which network science tools and data analysis strategies you've used, how those network science measures work, and why the tools you've chosen are right for the problem you're solving.¶

How did you use the tools to understand your dataset?¶

Text analysis¶

Data processing workflow¶

Calculate TF-IDF for each token¶

Text analysis¶

Generate Wordclouds¶

Evalution of text processing and Worcloud generation¶

Network analysis¶

Create graph¶

Graph builder¶

Find communites¶

Discussion¶