In [None]:
from IPython.core.display_functions import display

In [None]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import graph_tool.all as gt

import torch

from torch_geometric.data import Data

import qrcode

In [None]:
# Load seaborn's "planets" example dataset up front; the joint-histogram cell
# further down plots its "year" and "distance" columns.
planets = sns.load_dataset("planets")

In [None]:
# Generate the QR code image (qrcode001.png) that the title slide embeds.
input_data = "https://marcograssia.com/talk/mscx_2022_warmup/"

qr = qrcode.QRCode(
    version=1,    # starting symbol size; make(fit=True) may grow it to fit the data
    box_size=20,  # pixels per QR module
    border=5,     # quiet-zone width, in modules
)
qr.add_data(input_data)
qr.make(fit=True)
# The foreground colour keyword is `fill_color`. The previous `fill=` keyword is
# not read by the default PIL image factory, so '#111' was silently ignored.
img = qr.make_image(fill_color='#111', back_color='white')
img.save('qrcode001.png')

In [None]:
from pathlib import Path
from traitlets.config.manager import BaseJSONConfigManager

# Update the "rise" section of the JSON configuration kept under
# ~/.jupyter/nbconfig with the slideshow settings used for this talk.
path = Path.home().joinpath(".jupyter", "nbconfig")
cm = BaseJSONConfigManager(config_dir=str(path))

rise_options = {
    "footer": "<h3 class='text-muted'></h3>",
    "header": "<h3 class='text-right text-muted'>MCSX 2022: <i>python</i> warmup - 26 Jun 2022</h3>",
    "start_slideshow_at": "selected",
    "theme": "night",  # alternatives tried: "sky", "serif"
    "transition": "fade",
    "autolaunch": True,
    "width": "100%",
    "height": "100%",
    "scroll": True,
    "enable_chalkboard": True,
    "slideNumber": True,
    "center": True,
    "controlsLayout": "edges",
    "hash": True,
    "auto_select": "code",
    "auto_select_fragment": True,
}

# The return value is bound to a name, presumably so that the cell shows no output.
tmp = cm.update("rise", rise_options)

# Earlier, smaller configuration (JSON form), kept for reference:
#     "enable_chalkboard": true,
#     "footer": "<h3 class='text-muted'></h3>",
#     "hash": true,
#     "header": "<h3 class='text-muted'>MCSX Summer School 2022: <i>python</i> warmup</h3>",
#     "scroll": true,
#     "slideNumber": true,
#     "start_slideshow_at": "selected",
#     "theme": "night"

# Network analysis with Python 

Overview of the most used Python Libraries for Data and Network Analysis.

MCSX Summer School 2022 - 26 Jun 2022

### Marco Grassia
Research Fellow @ University of Catania, Italy

### This notebook is available at https://bit.ly/3u2S2Li
<br>
<img src="qrcode001.png" style="height: 550px">

https://marcograssia.com/talk/mscx_2022_warmup/


# Warmup?
Python is the most widely used programming language in data analysis.
In this warmup, we will introduce the most popular and powerful libraries
for Network Science (i.e., graph-tool, NetworkX),
for data analysis and visualization (Matplotlib, NumPy, SciPy, Pandas, Seaborn),
and for Deep Learning on graphs (PyTorch Geometric). 

## In this warm-up

We will have an **overview** of _Python_ and of the following **libraries**

# _Python_

## What is _python_

> Python is an *interpreted*, *object-oriented*, *high-level* programming language with dynamic semantics. [...]

> It is suitable for **rapid development** and for use as a "**glue language**" to connect various components (e.g., written in different languages).

_Python_ is **one of the most used programming languages**[1]

[1]: [StackOverflow's 2021 survey](https://insights.stackoverflow.com/survey/2021)

### Why is _python_ so popular?

Its popularity can be traced to its characteristics:
- _Python_ is fast to learn, very versatile and flexible
- It is very high-level, and complex operations can be performed with few lines of code

And to its large user-base:
- **Vast assortment of libraries**
- **Solutions to your problems may be already available** (e.g., on StackOverflow)

<!-- ![](stackoverflow_2021_languages.png) -->


### Some real-world applications

- Data science (loading, processing and plotting of data)
- Scientific and Numeric computing
- Modeling and Simulation
- Web development
- Artificial Intelligence and Machine Learning
- Image and Text processing
- Scripting

## How to get _python_?

Using the **default environment that comes with your OS is not a great idea**:
- Usually older _python_ versions are shipped
- You cannot upgrade the _python_ version or the libraries freely
- Some functions of your OS may depend on it

You can either:
1. Download _python_
2. Use a distribution like _Anaconda_

### Anaconda
_Anaconda_ is a _python_ distribution that packs the most used libraries for data analysis, processing and visualization

_Anaconda_ installations are managed through the _conda_ package manager

_Anaconda "distribution"_ is free and open source

<img src="images/Distro01.webp" style="max-height:450px;">

[Anaconda distribution](https://www.anaconda.com/products/distribution)

# _Python_ virtual **environments**
> A virtual **environment** is a Python environment such that the Python interpreter, libraries and scripts installed into it are isolated from those installed in other virtual environments

Environments are used to freeze specific interpreter and library versions for your projects

If you start a new project and need newer libraries, just create a new environment

You won't have to worry about breaking the other projects

<!-- If you upgrade the libraries or interpreter in one, you do not have to worry breaking the others

They are useful, for instance, when starting a new project or when upgrading your libraries -->
<!-- , ... for a project, then use different versions for another. -->

<!-- > , and (by default) any libraries installed in a “system” Python, i.e., one which is installed as part of your operating system. -->

<!-- “virtual” isolated Python installation and install packages into that virtual installation. When you switch projects, you can simply create a new virtual environment and not have to worry about breaking the packages installed in the other environments. It is always recommended to use a virtual environment while developing Python applications. -->



#### Environment creation
> conda create --name \<ENV_NAME\> [\<PACKAGES_LIST\>] [--channel \<CHANNEL\>]

You can also specify additional channels to search for packages (in order)

Example:

> conda create --name gt python=3.9 graph-tool pytorch torchvision torchaudio cudatoolkit=11.3 pyg seaborn numpy scipy matplotlib jupyter -c pyg -c pytorch -c nvidia -c anaconda -c conda-forge

#### Switch environment
> conda activate \<ENV_NAME\>


Example
> conda activate gt

[Full documentation](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html)

## Data analysis and visualization libraries
<!-- In this warm-up, we will have an **overview** of the following **data analysis and visualization libraries**: -->
- NumPy
- SciPy
- Matplotlib
- Seaborn
- Pandas

These libraries are general-purpose and can also be used in Network Analysis

## Network analysis and visualization libraries
<!-- In this warm-up,  -->
- graph-tool
- PyTorch Geometric (PyG)

## NumPy
<!-- : multidimensional arrays & mathematical functions -->

> _**NumPy**_ is the fundamental package for scientific computing in Python.

_NumPy_ offers new data structures:
- **Multidimensional array** objects (the **_ndarray_**)
- Various derived objects (like **_masked arrays_**)

<!-- - including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more -->

And also a **vast assortment of functions**:
  - **Mathematical functions** (arithmetic, trigonometric, hyperbolic, rounding, ...)
  - **Sorting, searching, and counting**
  - Operations on **Sets**
  - Input and output 
  - **Fast Fourier Transform**
  - Complex numbers handling
  - (pseudo) random number generation

_NumPy_ is fast:
- Its core is written in C/C++
- Arrays are stored in contiguous memory locations

It also offers tools for integrating C/C++ code

**Many libraries are built on top of NumPy**'s arrays and functions.

<!-- 
Binary operations

- Trigonometric functions
    Hyperbolic functions
    Rounding
    Sums, products, differences
    Exponents and logarithms
    Other special functions
    Floating point routines
    Rational routines
    Arithmetic operations
    Handling complex numbers
    Extrema Finding
    Random sampling (numpy.random)


It provides:

    a powerful N-dimensional array object
    sophisticated (broadcasting) functions
    
    useful linear algebra, Fourier transform, and random number capabilities


It also offers 


, random number generators, linear algebra routines, Fourier transforms, and more.

<!-- NumPy is the fundamental package for scientific computing in Python. -->

<!-- It is a Python library that provides a multidimensional array object, various derived objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more. -->

<!-- NumPy arrays -->

#### The ***ndarray***
<table style="background-color:#FFF;">
<tr>
<th> Mono-dimensional </th>
<th> Multi-dimensional </th>
</tr>
<tr>
<td>
  
<img src="images/array.png"  style="height:350px;">
    
</td>
<td>

<img src="images/mdim_array.png" style="height:350px;">
    
<!-- ![](images/mdim_array.png) -->
</td>
</tr>
</table>



[NumPy documentation](https://numpy.org/devdocs/index.html)

<!-- [NumPy reference](https://numpy.org/devdocs/reference/index.html) -->

In [None]:
import numpy as np

In [None]:
# 1-D array with 3 samples drawn uniformly from [0, 1)
np.random.rand(3)

In [None]:
# 2-D array of shape (1, 3): each positional argument is the length of one dimension
np.random.rand(1, 3)

In [None]:
# 3-D array of shape (2, 2, 2) filled with random integers in [0, 10)
np.random.randint(10, size=(2, 2, 2))

## _SciPy_
<!-- : fundamental algorithms for scientific computing in Python  -->

**_SciPy_** is a **collection of mathematical algorithms** and convenience functions **built on the NumPy library**

_SciPy_ is written in C and Fortran, and provides:
- Algorithms for **optimization, integration, interpolation, eigenvalue problems, algebraic equations, differential equations, statistics, etc**.
- Specialized data structures, such as **sparse matrices** and k-dimensional trees
- Tools for interactive Python sessions

_SciPy_'s main **subpackages** include:

- Data clustering algorithms
- Physical and mathematical constants
- Fast Fourier Transform routines
- Integration and ordinary differential equation solvers
- Linear algebra
- ...

- ...
- N-dimensional image processing
- Optimization and root-finding routines
- Signal processing
- Sparse matrices and associated routines
- Spatial data structures and algorithms
- Statistical distributions and functions

In [None]:
import scipy as sp

[SciPy documentation](https://docs.scipy.org/doc/scipy/index.html)

### Sparse matrices

There are many sparse matrix implementations, each optimized for different operations.

For instance:
- **Coo**rdinate (COO)
- **Li**nked **L**ist Matrix (LIL)
- **C**ompressed **S**parse **R**ow (CSR)
- **C**ompressed **S**parse **C**olumn (CSC)

Check this nice tutorial for more!
[Sparse matrices tutorial](https://matteding.github.io/2019/04/25/sparse-matrices/)

## Pandas

*pandas* allows easy data organization, filtering, analysis and plotting

*pandas* provides **data structures** for **“relational” or “labeled” data**, for instance:
<!-- Pandas is a python library that provides high-performance, easy-to-use data structures and data analysis tools. -->
<!-- For instance, they are well suited for: -->
- **Tabular data** with **heterogeneously-typed columns**, as in an Excel spreadsheet
- Ordered and unordered **time series data**
- **Arbitrary matrix data** (even heterogeneously typed) with row and column labels
<!-- - Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure -->
 <!-- is a Python package providing  designed to make working with “relational” or “labeled” data both easy and intuitive. -->

The two primary data structures provided are the:
- **Series** (1-dimensional)
- **DataFrame** (2-dimensional)

<table style="background-color:#FFF;">
<tr>
<th> Series</th>
<th> DataFrame </th>
</tr>
<tr>
<td>
  
<img src="images/series.png"  style="height:450px;">
    
</td>
<td>

<img src="images/dataframe.png" style="height:450px;">
    
<!-- ![](images/mdim_array.png) -->
</td>
</tr>
</table>


These structures heavily rely on _NumPy_ and its arrays

_pandas_ integrates well with other libraries built on top of _NumPy_

<!-- and its data structures are built on top of NumPy and  -->
<!-- - , handle the vast majority of typical use cases in finance, statistics, social science, and many areas of engineering. For R users, DataFrame provides everything that R’s data.frame provides and much more. -->

 <!-- It aims to be the fundamental high-level building block for doing practical, real-world data analysis in Python. -->
 
 <!-- Additionally, it has the broader goal of becoming the most powerful and flexible open source data analysis/manipulation tool available in any language. -->
 
 <!-- It is already well on its way toward this goal. -->


 <!-- intended to integrate well within a scientific computing environment with many other 3rd party libraries. -->

<!-- Data is stored as NumPy arrays -->


<!-- Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments.
For data scientists, working with data is typically divided into multiple stages: munging and cleaning data, analyzing / modeling it, then organizing the results of the analysis into a form suitable for plotting or tabular display. -->

<!-- pandas is the ideal tool for all of these tasks. -->

 #### Supported file formats
 _pandas_ can read data from and write data to SQL databases, Excel files, CSVs, ...

 <img src="images/02_io_readwrite.svg" height="400px">

In [None]:
import pandas as pd

[pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html)

#### DataFrame example
Penguins example dataset from the Seaborn package

In [None]:
# Load the penguins example dataset shipped with seaborn; it is reused by the
# pandas and seaborn examples in the following cells.
penguins = sns.load_dataset("penguins")
# Render the DataFrame through IPython's rich display
display(penguins)

##### Series of the species

In [None]:
# Selecting a single column by its label returns a Series
penguins["species"]

##### Unique species

In [None]:
# unique() returns the distinct values of the column
penguins["species"].unique()

##### Average bill length

In [None]:
# Mean of the column (pandas skips missing values by default)
penguins["bill_length_mm"].mean()

##### Standard deviation of the bill length

In [None]:
# Sample standard deviation of the column (pandas uses ddof=1 by default)
penguins["bill_length_mm"].std()

##### Data filtering for male penguins

In [None]:
# Element-wise comparison: yields a boolean Series that the next cells use as a row mask
penguins["sex"] == "Male"

In [None]:
# Indexing .loc with a boolean Series keeps only the rows where it is True
penguins.loc[penguins["sex"] == "Male"]

##### Average bill length for male penguins

In [None]:
# .loc[rows, column]: the boolean mask filters the rows, the label picks the column
is_male = penguins["sex"] == "Male"
penguins.loc[is_male, "bill_length_mm"].mean()

##### Average bill length and weight for female penguins

In [None]:
# .loc[rows, columns]: a list of labels selects several columns at once,
# so mean() returns one value per selected column
is_female = penguins["sex"] == "Female"
measurements = ["bill_length_mm", "body_mass_g"]
penguins.loc[is_female, measurements].mean()

## _Matplotlib_

> _Matplotlib_ is a comprehensive library for creating static, animated, and interactive visualizations in _Python_
<!--  . Matplotlib makes easy things easy and hard things possible. -->

[Matplotlib documentation](https://matplotlib.org/stable/index.html)

### Some plot examples 

From the [Matplotlib gallery](https://matplotlib.org/stable/gallery/index.html)

#### Line plots
  
<img src="images/sphx_glr_csd_demo_001.webp" height="350px">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/csd_demo.html#sphx-glr-gallery-lines-bars-and-markers-csd-demo-py)

#### Scatterplots and histograms

<img class="r-stretch" src="images/sphx_glr_scatter_hist_001.webp" style="height:550px;">
<!-- ![](images/sphx_glr_scatter_hist_001.webp) -->

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/scatter_hist.html#sphx-glr-gallery-lines-bars-and-markers-scatter-hist-py)

#### Barplots

##### Simple barplot

<img src="images/sphx_glr_barh_001.webp" style="max-height:550px;">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/barh.html#sphx-glr-gallery-lines-bars-and-markers-barh-py)

##### Stacked barplot

<img src="images/sphx_glr_bar_stacked_001.webp" style="max-height:550px;">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py)

##### Grouped barplot
<img src="images/sphx_glr_barchart_001.webp" style="max-height:550px;">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py)

#### Horizontal bar chart
<img src="images/sphx_glr_horizontal_barchart_distribution_001.webp" style="max-height:650px;">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/horizontal_barchart_distribution.html#discrete-distribution-as-horizontal-bar-chart)

#### (Nested) pie charts

<img src="images/sphx_glr_nested_pie_001.webp" style="max-height:350px;">

[Source](https://matplotlib.org/stable/gallery/pie_and_polar_charts/nested_pie.html#sphx-glr-gallery-pie-and-polar-charts-nested-pie-py)

#### Heatmaps

<img src="images/sphx_glr_image_annotated_heatmap_001.webp" style="max-height:650px;">

[Source](https://matplotlib.org/stable/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py)

#### Violin and box plots
<img src="images/sphx_glr_boxplot_vs_violin_001.webp" style="max-height:650px;">

[Source](https://matplotlib.org/stable/gallery/statistics/boxplot_vs_violin.html#sphx-glr-gallery-statistics-boxplot-vs-violin-py)

#### Stackplots 

<img src="images/sphx_glr_stackplot_demo_001.webp" style="max-height:650px;">

[Source](https://matplotlib.org/stable/gallery/lines_bars_and_markers/stackplot_demo.html#sphx-glr-gallery-lines-bars-and-markers-stackplot-demo-py)

#### ... and many more

In [None]:
import matplotlib.pyplot as plt

## _Seaborn_

> _Seaborn_ is a library for making statistical graphics in Python

Thanks to its **high-level interface**, it makes plotting very complex figures easy

_Seaborn_ **builds on top of _matplotlib_** and **integrates** closely **with _pandas_ data structures**

In [None]:
import seaborn as sns

It provides **helpers to improve how all matplotlib plots look**:
- Theme and style
- Colors (even colorblind palettes)
- Scaling, to quickly switch between presentation contexts (e.g., plot, poster and talk)

In [None]:
# Restore the default rc parameters so the next plot shows matplotlib's stock look
sns.reset_defaults()

In [None]:
# Baseline plot drawn with the default matplotlib style.
# The trailing semicolon keeps the `[<Line2D ...>]` return value out of the cell output.
plt.plot(range(10), range(10));

In [None]:
# Apply a seaborn theme; it is set through rc parameters, so it also affects
# plain matplotlib plots drawn afterwards.
sns.set_theme(
    context="talk",
    style="ticks",
    palette="deep",
    font="sans-serif",
    color_codes=True,
    rc={"figure.facecolor": "white"},
)
# Options tried earlier and left disabled:
#   font_scale=1
#   rc: 'figure.figsize': (10, 6), "text.usetex": True, "font.family": "sans-serif"

In [None]:
# Same plot as before, now drawn after the seaborn theme has been applied
plt.plot(range(10), range(10))
# Remove the top and right spines (despine's defaults)
sns.despine()

_Seaborn's_ _FacetGrid_ offers a convenient way to visualize multiple plots in grids

They can be drawn with up to three dimensions: rows, columns and hue

[Tutorial: Building structured multi-plot grids](https://seaborn.pydata.org/tutorial/axis_grids.html)

In [None]:
# Preview the first rows to recall the column names used in the plots below
penguins.head()

In [None]:
# One scatter panel per value of "sex" (grid columns); colour encodes body mass
g = sns.relplot(
    data=penguins,
    x="flipper_length_mm",
    y="bill_length_mm",
    hue="body_mass_g",
    col="sex",
)
g.set_axis_labels("Flipper length (mm)", "Bill length (mm)")

In [None]:
# Full grid: rows split by "sex", columns by "species"; colour encodes body mass
g = sns.relplot(
    data=penguins,
    x="flipper_length_mm",
    y="bill_length_mm",
    hue="body_mass_g",
    row="sex",
    col="species",
)
g.set_axis_labels("Flipper length (mm)", "Bill length (mm)")

<!-- , even if you don’t make them with seaborn -->

<!--     Seaborn helps you explore and understand your data. Its plotting functions operate on dataframes and arrays containing whole datasets and internally perform the necessary semantic mapping and statistical aggregation to produce informative plots. Its dataset-oriented, declarative API lets you focus on what the different elements of your plots mean, rather than on the details of how to draw them
 -->
<!--     Seaborn is a Python data visualization library based on matplotlib.
    It provides a  high-level interface for drawing attractive and informative statistical graphics. -->

<!--     For a brief introduction to the ideas behind the library, you can read the introductory notes or the paper. Visit the installation page to see how you can download the package and get started with it. You can browse the example gallery to see some of the things that you can do with seaborn, and then check out the tutorial or API reference to find out how. -->
<!--     To see the code or report a bug, please visit the GitHub repository. General support questions are most at home on stackoverflow or discourse, which have dedicated channels for seaborn.

    # Apply the default theme
    sns.set_theme()

    This uses the matplotlib rcParam system and will affect how all matplotlib plots look, even if you don’t make them with seaborn. Beyond the default theme, there are several other options, and you can independently control the style and scaling of the plot to quickly translate your work between presentation contexts (e.g., making a version of your figure that will have readable fonts when projected during a talk). If you like the matplotlib defaults or prefer a different theme, you can skip this step and still use the seaborn plotting functions. -->

[Seaborn documentation](https://seaborn.pydata.org/)

### Some more plot examples

From [Seaborn example gallery](https://seaborn.pydata.org/examples/index.html)



#### Smooth kernel density with marginal histograms [(source)](https://seaborn.pydata.org/examples/smooth_bivariate_kde.html)
<!-- ![](images/smooth_bivariate_kde.png) -->

In [None]:
# Joint grid with no gap (space=0) between the central and the marginal axes
g = sns.JointGrid(data=penguins, x="body_mass_g", y="bill_depth_mm", space=0)

# Central panel: filled bivariate KDE, clipped to fixed (x, y) ranges
g.plot_joint(
    sns.kdeplot,
    fill=True,
    clip=((2200, 6800), (10, 25)),
    thresh=0,
    levels=100,
    cmap="rocket",
)

# Side panels: histogram of each variable
g.plot_marginals(sns.histplot, color="#03051A", alpha=1, bins=25)

#### Joint and marginal histograms
<!-- 
<img src="images/joint_histogram.png" style="height:450px">

sns.set_theme(style="ticks")

# Load the planets dataset and initialize the figure
planets = sns.load_dataset("planets")
g = sns.JointGrid(data=planets, x="year", y="distance", marginal_ticks=True)

# Set a log scaling on the y axis
g.ax_joint.set(yscale="log")

# Create an inset legend for the histogram colorbar
cax = g.figure.add_axes([.15, .55, .02, .2])

# Add the joint and marginal histogram plots
g.plot_joint(
    sns.histplot, discrete=(True, False),
    cmap="light:#03012d", pmax=.8, cbar=True, cbar_ax=cax
)
g.plot_marginals(sns.histplot, element="step", color="#03012d") -->

[Source](https://seaborn.pydata.org/examples/joint_histogram.html)

In [None]:
# Joint histogram of discovery year vs. distance.
# Relies on `planets`, loaded in one of the first cells of the notebook.
g = sns.JointGrid(data=planets, x="year", y="distance", marginal_ticks=True)

# Set a log scaling on the y axis (done before any plotting call)
g.ax_joint.set(yscale="log")

# Create an inset axes for the histogram colorbar;
# the list is [left, bottom, width, height] in figure coordinates
cax = g.figure.add_axes([.15, .55, .02, .2])

# Add the joint and marginal histogram plots.
# discrete=(True, False): treat x (year) as discrete, y (distance) as continuous
g.plot_joint(
    sns.histplot, discrete=(True, False),
    cmap="light:#03012d", pmax=.8, cbar=True, cbar_ax=cax
)
g.plot_marginals(sns.histplot, element="step", color="#03012d")

#### Custom projections

<img src="images/radial_facets.png" style="max-height:350px;">

[Source](https://seaborn.pydata.org/examples/radial_facets.html)

#### Discovering structure in heatmap data

<img src="images/structured_heatmap.png" style="height:650px">

[Source](https://seaborn.pydata.org/examples/structured_heatmap.html)

#### Bivariate plot with multiple elements

<img src="images/layered_bivariate_plot.png" style="height:450px">

[Source](https://seaborn.pydata.org/examples/layered_bivariate_plot.html)

# Network Analysis with Python

Three main libraries:
- _graph-tool_
- _networkx_
- _python-igraph_

<!--  collection of tools for creating, manipulating and analysing graphs. It can be used from C, R, Python and Mathematica programming languages  -->

## _graph-tool_

_graph-tool_ is a **graph analysis** library for _Python_

It provides the ***Graph*** data structure, and various algorithms

It is mostly written in C++, and based on the Boost Graph Library

It supports multithreading and it is fairly easy to extend

Built in algorithms:
- **Topology** analysis tools
- **Centrality**-related **algorithms**
- **Clustering coefficient** (transitivity) algorithms
- **Correlation** algorithms, like the **assortativity**
- **Dynamical processes** (e.g., SIR, SIS, ...)
- Graph **drawing** tools
- **Random graph** generation
- **Statistical inference of generative network models**
- **Spectral properties** computation

[Documentation](https://graph-tool.skewed.de/static/doc/index.html)

In [None]:
import graph_tool.all as gt

## NetworkX
An easy-to-use package for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks. Written in Python, it is the most feature-rich of the three. It provides:

- Data structures for graphs, digraphs, and multigraphs
- Many standard graph algorithms
- Network structure and analysis measures
- Generators for classic graphs, random graphs, and synthetic networks

Performance comparison (source: [graph-tool.skewed.de/](https://graph-tool.skewed.de/performance))


| Algorithm                     	| graph-tool (16 threads) 	| graph-tool (1 thread) 	| igraph                                          	| NetworkX                                             	|
|-------------------------------	|-------------------------	|-----------------------	|-------------------------------------------------	|------------------------------------------------------	|
| Single-source shortest path   	|  0.0023 s               	|  0.0022 s             	|  0.0092 s                                       	|    0.25 s                                            	|
| Global clustering             	|  0.011 s                	|  0.025  s             	|  0.027 s                                        	|    7.94 s                                            	|
| PageRank                      	|  0.0052 s               	|  0.022  s             	|  0.072 s                                        	|    1.54 s                                            	|
| K-core                        	|  0.0033 s               	|  0.0036 s             	|  0.0098 s                                       	|    0.72 s                                            	|
| Minimum spanning tree         	|  0.0073 s               	|  0.0072 s             	|  0.026 s                                        	|    0.64 s                                            	|
| Betweenness                   	|  102 s (~1.7 mins)      	|  331 s (~5.5 mins)    	|  198 s (vertex)  + 439 s (edge)  (~ 10.6 mins)  	|    10297 s (vertex)   13913 s (edge)  (~6.7 hours)   	|

## How to load a network

0. Choose a graph analysis library.
   The right one mostly depends on your needs (e.g., functions, performance, etc.)

    In this warm-up, we will use _graph-tool_.

1. To load the network, we need to use the right loader function, which depends on the file format

### File formats

Many ways to represent and store graphs.

The most popular ones are:
- edgelist
- GraphML

For more about file types, check the [NetworkX documentation](https://networkx.org/documentation/stable/reference/readwrite/index.html)

### Edgelist (.el, .edge, ...)
As the name suggests, it is a list of node pairs (source, target) and edge properties (if any).
Edgelists cannot store any information about the nodes, or about the graph (not even about the directedness)

Values may be separated by commas, spaces, tabs, etc.
Comments may be supported by the reader function.

Example file:

```csv
# source, target, weight
0,1,1
0,2,2
0,3,2
0,4,1
0,5,1
0,6,1
1,18,1
1,3,1
1,4,2
2,0,1
2,25,1
#...
```

### GraphML (.graphml, .xml)

Flexible format based on XML.

It can store hierarchical graphs, information (i.e., attributes or properties) about the graph, the nodes and the edges.

Main drawback: heavy disk usage (space, and I/O time)


Example of the file:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<graphml xmlns="http://graphml.graphdrawing.org/xmlns"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">

  <!-- property keys -->
  <key id="key0" for="node" attr.name="_pos" attr.type="vector_float" />
  <key id="key1" for="graph" attr.name="citation" attr.type="string" />
  <key id="key2" for="graph" attr.name="description" attr.type="string" />
  <!-- [...] -->
  <key id="key8" for="edge" attr.name="weight" attr.type="short" />

  <graph id="G" edgedefault="directed" parse.nodeids="canonical" parse.edgeids="canonical" parse.order="nodesfirst">

   <!-- graph properties -->
   <data key="key1">[&apos;J. S. Coleman. &quot;Introduction to Mathematical Sociology.&quot; London Free Press Glencoe (1964), http://www.abebooks.com/Introduction-Mathematical-Sociology-COLEMAN-James-S/189127582/bd&apos;]</data>
   <data key="key2">A network of friendships among male students in a small high school in Illinois from 1958. An arc points from student i to student j if i named j as a friend, in either of two identical surveys (from Fall and Spring semesters). Edge weights are the number of surveys in which the friendship was named.</data>

   <!-- [...] -->

   <!-- vertices -->
    <node id="n0">
      <data key="key0">0.92308158331278289, 12.186082864409657</data>
    </node>
    <node id="n1">
      <data key="key0">1.2629064355495019, 12.213213242633238</data>
    </node>
    <node id="n2">
      <data key="key0">1.1082744694986855, 12.190211909578192</data>
    </node>

    <!-- [...] -->

    <!-- edges -->
    <edge id="e0" source="n0" target="n1">
      <data key="key8">1</data>
    </edge>
    <edge id="e1" source="n0" target="n2">
      <data key="key8">2</data>
    </edge>
    <edge id="e2" source="n0" target="n3">
      <data key="key8">2</data>
    </edge>
    <edge id="e3" source="n0" target="n4">
      <data key="key8">1</data>
    </edge>

    <!-- [...] -->

   </graph>
</graphml>

```

# How to load a network

0. Choose a graph analysis library.
   The right one mostly depends on your needs (e.g., features, performance, etc.)

    In this warm-up, we will use _graph-tool_.

1. To load the network, we need to use the right loader function, which depends on the file format
   

2. After identifying the file format and the right loader function, we load the network

In [None]:
# Load the network from the GraphML file (relative path) with graph-tool
# and show the resulting graph object.
g = gt.load_graph("highschool.graphml")

display(g)

In [None]:
# Properties attached to the graph as a whole.
display(g.graph_properties)

In [None]:
# Properties attached to the vertices (nodes).
display(g.vertex_properties)

In [None]:
# Properties attached to the edges.
display(g.edge_properties)

## Some network analysis


#### Get the number of nodes

In [None]:
# Count the vertices (nodes) of the graph and show the result.
number_of_nodes = g.num_vertices()
display(f"Number of nodes: {number_of_nodes}")

#### Get the number of edges

In [None]:
# Count the edges of the graph and show the result.
number_of_edges = g.num_edges()
display(f"Number of edges: {number_of_edges}")

#### Get the in and out degrees

In [None]:
# In-degree of every vertex returned by g.get_vertices(); no edge weights
# are passed (eweight=None).
in_degree = g.get_in_degrees(g.get_vertices(), eweight=None)

In [None]:
# Mean of the per-vertex in-degrees.
average_in_degree = np.mean(in_degree)

display("Average in degree", average_in_degree)

In [None]:
# Out-degree of every vertex returned by g.get_vertices(); no edge weights
# are passed (eweight=None).
out_degree = g.get_out_degrees(g.get_vertices(), eweight=None)

In [None]:
# Mean of the per-vertex out-degrees.
average_out_degree = np.mean(out_degree)
display("Average out degree", average_out_degree)

#### In-degree distribution

In [None]:
# In-degree histogram drawn directly with matplotlib.
p = plt.hist(in_degree)

# Label both axes, then tidy the axes with seaborn's despine.
plt.xlabel("In-degree")
plt.ylabel("Count")
sns.despine()

In [None]:
# Same histogram with seaborn: raw counts, values treated as discrete.
p = sns.histplot(in_degree, stat="count", discrete=True)

# histplot returns the matplotlib Axes, which is labelled directly.
p.set(xlabel="In-degree")
sns.despine()

In [None]:
# In-degree histogram with the bar heights expressed as percentages.
p = sns.histplot(in_degree, stat="percent", discrete=True)

# histplot returns the matplotlib Axes, which is labelled directly.
p.set(xlabel="In-degree")
sns.despine()

In [None]:
# In-degree histogram using seaborn's "frequency" statistic; this time the
# x label is set through the pyplot interface.
sns.histplot(in_degree, stat="frequency", discrete=True)

plt.xlabel("In-degree")
sns.despine()

In [None]:
# Overlay the in- and out-degree histograms (as percentages) on the same axes.
# Both series are drawn with identical options; only the data and the legend
# label change.
for degree_values, series_label in (
    (in_degree, "In-degree"),
    (out_degree, "Out-degree"),
):
    sns.histplot(
        degree_values,
        stat="percent",
        discrete=True,
        label=series_label,
        legend=True,
    )

plt.xlabel("Degree")
plt.legend()
sns.despine()

In [None]:
# Two colours from seaborn's "deep" palette; the following plots use
# cmap[0] for the "in" series and cmap[1] for the "out" series.
cmap = sns.color_palette("deep", n_colors=2)

# Bare expression: the notebook renders the palette as the cell output.
cmap

In [None]:
# In- and out-degree distributions (as probabilities) on the same axes.
# Each series gets its own palette colour and is semi-transparent so the
# overlap stays visible.
for series_idx, (degree_values, series_label) in enumerate((
    (in_degree, "In-degree"),
    (out_degree, "Out-degree"),
)):
    sns.histplot(
        degree_values,
        stat="probability",
        discrete=True,
        label=series_label,
        legend=True,
        color=cmap[series_idx],
        alpha=0.6,
    )

plt.title("Degree distribution")
plt.xlabel("Degree")
plt.legend()
sns.despine()

#### Get the in and out strength

In [None]:
# Edge property map holding the edge weights.
weight = g.edge_properties["weight"]

# Same degree getters as for the plain degrees, now with the edge weights
# passed as `eweight`; the results are used as in-/out-strength.
in_strength = g.get_in_degrees(g.get_vertices(), eweight=weight)

out_strength = g.get_out_degrees(g.get_vertices(), eweight=weight)

In [None]:
# In- and out-strength distributions (as probabilities) on the same axes.
# Unlike the degree plots, the values are binned (discrete=False).
for series_idx, (strength_values, series_label) in enumerate((
    (in_strength, "In-strength"),
    (out_strength, "Out-strength"),
)):
    sns.histplot(
        strength_values,
        stat="probability",
        discrete=False,
        label=series_label,
        legend=True,
        color=cmap[series_idx],
        alpha=0.6,
    )

plt.title("Strength distribution")
plt.xlabel("Strength")
plt.legend()
sns.despine()

### Store the values as a DataFrame

In [None]:
# Collect the per-vertex measures in one DataFrame. The column keys are
# (kind, direction) tuples, so the frame gets two-level column labels.
df = pd.DataFrame({
    ("Degree", "In"): in_degree,
    ("Degree", "Out"): out_degree,
    ("Strength", "In"): in_strength,
    ("Strength", "Out"): out_strength,
})

# Preview the first rows.
df.head()

### ... and plot the DF using _Seaborn_

In [None]:
# Reshape to long format: the two column levels become the "Kind" and
# "Direction" columns, the measurements go into "Value".
melted_df = df.melt(
    var_name=["Kind", "Direction"],
    value_name="Value",
)

melted_df

In [None]:
# Grid of histograms of "Value": one row per "Kind", one column per
# "Direction", coloured by "Direction".
facet = sns.displot(melted_df,
                    x="Value",
                    kind="hist",
                    row="Kind",
                    col="Direction",
                    hue="Direction",
                    )

In [None]:
# Lower-case the "Kind" labels so that the panel titles built below read
# like "In-degree" / "Out-strength".
melted_df["Kind"] = melted_df["Kind"].map(str.lower)

# Same grid of histograms as before: rows by "Kind", columns by "Direction".
facet = sns.displot(
    melted_df,
    x="Value",
    kind="hist",
    row="Kind",
    col="Direction",
    hue="Direction",
)

# Title every panel as "<column value>-<row value>".
facet.set_titles(template="{col_name}-{row_name}")

## Graph visualization

#### 1. Compute the node layout

In [None]:
# Node positions from graph-tool's Fruchterman-Reingold layout, run with
# n_iter=1000.
pos = gt.fruchterman_reingold_layout(g, n_iter=1000)

#### 2. Plot the network

In [None]:
# Draw the graph at the computed positions on a dark ("#111") background.
gt.graph_draw(g, pos=pos,
              bg_color="#111",
              )

#### Add the edge weight

In [None]:
# Draw the graph with each edge's pen width taken from its "weight" property.
gt.graph_draw(g,
              pos=pos,
              edge_pen_width=g.edge_properties["weight"],
              bg_color="#111",
              )

#### More layouts

In [None]:
# Recompute the positions with the SFDP layout (this rebinds `pos`) and
# draw the weighted graph again.
pos = gt.sfdp_layout(g)

gt.graph_draw(g, pos=pos,
              edge_pen_width=g.edge_properties["weight"],
              bg_color="#111",
              )

In [None]:
# Recompute the positions with the ARF layout (this rebinds `pos`) and draw
# the graph without edge widths.
pos = gt.arf_layout(g)

gt.graph_draw(g,
              pos=pos,
              bg_color="#111",
              )

In [None]:
# Random node positions (this rebinds `pos`), drawn with weighted edge widths.
pos = gt.random_layout(g)

gt.graph_draw(g,
              pos=pos,
              edge_pen_width=g.edge_properties["weight"],
              bg_color="#111",
              )

## Centrality computation

In [None]:
# Label the vertices that belong to the largest component of `g`...
largest_component_filter = gt.label_largest_component(g)

# ...and build a view of `g` using that labelling as the vertex filter.
gw = gt.GraphView(g, vfilt=largest_component_filter)

#### PageRank

In [None]:
# PageRank computed on the full graph `g` (not on the `gw` view).
pr = gt.pagerank(g)

In [None]:
# Distribution of the PageRank values (`pr.a` is the array behind `pr`).
sns.displot(pr.a)
plt.xlabel("PageRank")

#### Betweenness

In [None]:
# gt.betweenness returns a pair, unpacked here as the vertex and the edge
# betweenness of the full graph `g`.
vertex_betweenness, edge_betweenness =  gt.betweenness(g)

In [None]:
# Distribution of the vertex betweenness values.
sns.displot(vertex_betweenness.a)
plt.xlabel("Vertex betweenness") 

In [None]:
# Distribution of the edge betweenness values.
sns.displot(edge_betweenness.a)
plt.xlabel("Edge betweenness")

In [None]:
# Drawing options for the largest-component view `gw`:
# - positions come from the "_pos" vertex property stored with the graph;
# - vertices are coloured, ordered and sized by PageRank;
# - edges are coloured by edge betweenness and their width follows "weight".
centrality_draw_options = dict(
    pos=g.vp["_pos"],
    vertex_fill_color=pr,
    vorder=pr,
    edge_color=edge_betweenness,
    vertex_size=gt.prop_to_size(pr, mi=5, ma=15),
    vcmap=sns.color_palette("gist_heat", as_cmap=True),
    ecmap=sns.color_palette("rocket", as_cmap=True),
    edge_pen_width=g.edge_properties["weight"],
    bg_color="white",
)

gt.graph_draw(gw, **centrality_draw_options)

## Inferring modular structure

In [None]:
# Fit a nested block model to the graph. The non-nested alternative is kept
# commented out on the next line.
# state = gt.minimize_blockmodel_dl(g)
state = gt.minimize_nested_blockmodel_dl(g)

In [None]:
# Draw the fitted nested block model.
state.draw()

In [None]:
# Per-level states of the nested model; levels[0] is used below to get the
# block label of every vertex.
levels = state.get_levels()

In [None]:
# Drawing options for the full graph `g`:
# - positions come from the "_pos" vertex property stored with the graph;
# - vertices are coloured by their block at the lowest level of the hierarchy,
#   and sized and ordered by PageRank;
# - edges are coloured by edge betweenness and their width follows "weight".
block_draw_options = dict(
    pos=g.vp["_pos"],
    vertex_fill_color=levels[0].get_blocks(),
    edge_color=edge_betweenness,
    vertex_size=gt.prop_to_size(pr, mi=5, ma=15),
    vorder=pr,
    vcmap=sns.color_palette("tab10", as_cmap=True),
    ecmap=sns.color_palette("rocket", as_cmap=True),
    edge_pen_width=g.edge_properties["weight"],
    bg_color="white",
)

gt.graph_draw(g, **block_draw_options)

### Add some columns to our DataFrame

In [None]:
# Add the per-vertex results as new columns (this mutates `df` in place).
# NOTE(review): assumes the `.a` arrays are ordered like the rows of `df`
# (both are per-vertex values of `g`) — confirm.
df["PageRank"] = pr.a

df["Betweenness"] = vertex_betweenness.a

# Block label of each vertex at the lowest level of the nested model.
df["Block"] = levels[0].get_blocks().a

df.head()

### Save the DataFrame


#### To CSV (Comma Separated Values)

In [None]:
# Write the DataFrame to a CSV file (relative path).
df.to_csv("dataframe.csv")

#### To Excel

In [None]:
# Write the DataFrame to an Excel workbook (relative path).
df.to_excel("dataframe.xlsx")

## PyTorch Geometric (PyG)

> _PyG_ is a library built upon _PyTorch_ to easily write and train _Graph Neural Networks_ (GNNs) for a wide range of applications related to structured data.

Library for Deep Learning on graphs

It provides a large collection of GNN and pooling layers

New layers can be created easily

It offers:
- Support for Heterogeneous and Temporal graphs
- Mini-batch loaders
- Multi GPU-support
- DataPipe support
- Distributed graph learning via Quiver
- A large number of common benchmark datasets
- The GraphGym experiment manager

[PyTorch Geometric documentation](https://pytorch-geometric.readthedocs.io/en/latest/)

### [Introduction by example](https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html)

Each network is described by an instance of _torch_geometric.data.Data_, which includes:
- _data.**x**_: **node feature matrix**, with shape [num_nodes, num_node_features]
- _data.**edge_index**_: **edge list in COO format**, with shape [2, num_edges] and type _torch.long_
- _data.**edge_attr**_: **edge feature matrix**, with shape [num_edges, num_edge_features]
- _data.**y**_: target to train against (may have arbitrary shape)

In [None]:
import torch

from torch_geometric.data import Data

#### Let's build our Data object

#### Node features
We do not have any node features

We can use a constant for each node, e.g., $1$

In [None]:
# Constant node feature: a single column of ones with one row per vertex.
x = torch.ones((g.num_vertices(), 1), dtype=torch.float32)

# Show the shape, then the transposed tensor (a single row is more compact
# to display than a single column).
display(x.shape)
display(x.T)

#### Connectivity matrix and edge attributes

In [None]:
# Uninitialised (2, num_edges) integer tensor; a later cell fills row 0 with
# the sources and row 1 with the targets.
edge_index = torch.empty((2, g.num_edges()), dtype=torch.long)
display("edge_index", edge_index.shape)

# Uninitialised 1-D float tensor with one entry per edge, filled later with
# the edge weights.
edge_attr = torch.empty((g.num_edges(),), dtype=torch.float32)
display("edge_attr", edge_attr.shape)

In [None]:
# Copy every edge (source, target) and its weight into the pre-allocated
# tensors, one edge per column/entry.
# The loop variable is named `edge_weight` rather than `weight` so that the
# notebook-level `weight` edge property map defined in the strength cell is
# not overwritten with a scalar.
for i, (source, target, edge_weight) in enumerate(g.iter_edges(eprops=[g.edge_properties["weight"]])):
    edge_index[0, i] = source
    edge_index[1, i] = target

    edge_attr[i] = edge_weight


# Preview the first ten edges and their weights.
display("edge_index", edge_index[:, :10])

display("edge_attr", edge_attr[:10])

#### Create the _Data_ instance

In [None]:
# Bundle node features, connectivity and edge attributes into a single
# PyG `Data` object describing the network.
network_data = Data(
    x=x,
    edge_index=edge_index,
    edge_attr=edge_attr,
)

display(network_data)

### Create the train and test set

For link prediction, we need positive (existent) and negative (non-existent) edges

We can use the _RandomLinkSplit_ class, that does the _negative sampling_ for us

In [None]:
from torch_geometric.transforms import RandomLinkSplit

# Configure the edge-level split: no validation edges (num_val=0), test
# share num_test=0.2, negative samples added to the training set with a
# neg_sampling_ratio of 1.0, graph treated as directed (is_undirected=False).
# NOTE(review): no random seed is set in this cell, so the split presumably
# differs between runs — confirm whether reproducibility is needed.
transform = RandomLinkSplit(num_val=0,
                            num_test=0.2,
                            disjoint_train_ratio=0.2,
                            split_labels=False,
                            add_negative_train_samples=True,
                            neg_sampling_ratio=1.0,
                            is_undirected=False,
)
# The transform returns three objects; the middle one (validation) is discarded.
train_data, _, test_data = transform(network_data)

In [None]:
# Summary of the training split.
display(train_data)

In [None]:
# Summary of the test split.
display(test_data)

#### Create the model

Model architecture:
- 2x GINE convolutional layers
- 1x Multi-Layer Perceptron (MLP)

The GINE layers will compute the ***node embedding***

We can build the ***edge embedding*** by, e.g., concatenating the source and target nodes' embedding

The MLP will take the edge embeddings and return a score (logit) for each edge; applying a sigmoid turns it into a probability


In [None]:
from torch_geometric.nn import MLP

class MLP(MLP):
    """Subclass of the imported ``MLP`` that adds indexing support.

    The class reuses the name ``MLP``, so after this definition the name
    refers to this subclass.
    """

    def __getitem__(self, item):
        """Return ``self.lins[item]``, i.e. index into the ``lins`` container."""
        layers = self.lins
        return layers[item]


In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GINEConv
from torch.nn import Sequential, Linear, ELU

class GINEModel(torch.nn.Module):
    """Link-prediction model: two GINE convolutions followed by an edge-level MLP.

    The convolutions turn the node features into node embeddings. For every
    candidate edge, the embeddings of its two endpoints are concatenated and
    passed to ``edge_regression``, which outputs one value per edge.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, edge_dim):
        """Build the layers.

        Args:
            in_channels: number of input features per node.
            hidden_channels: width used inside both convolution MLPs (and as
                the output width of the first convolution).
            out_channels: size of the node embedding produced by ``conv2``.
            edge_dim: forwarded unchanged to both ``GINEConv`` layers.
        """
        super().__init__()
        self.conv1 = GINEConv(nn=MLP([in_channels, hidden_channels, hidden_channels]),
                              train_eps=False, edge_dim=edge_dim)

        self.conv2 = GINEConv(nn=MLP([hidden_channels, hidden_channels, out_channels]),
                              train_eps=False, edge_dim=edge_dim)

        # The input is the concatenation of two node embeddings, hence
        # 2 * out_channels input features.
        self.edge_regression = MLP(channel_list=[2 * out_channels, out_channels, 1],
                                   batch_norm=False, dropout=0.3)

    def forward(self, x, edge_index, target_edges, edge_attr=None):
        """Score the node pairs listed in ``target_edges``.

        Args:
            x: node feature matrix.
            edge_index: edges used for message passing by both convolutions.
            target_edges: index tensor with two rows (row 0 and row 1 hold the
                two endpoints) of the node pairs to score.
            edge_attr: edge features handed to both convolutions. Optional for
                backward compatibility: when omitted, the notebook-level
                ``edge_attr`` tensor is looked up at call time, which is what
                this method did before the parameter existed. Passing it
                explicitly removes the hidden dependency on global state; the
                tensor must be compatible with the ``edge_dim`` given at
                construction.

        Returns:
            The output of ``edge_regression``: one value per pair in
            ``target_edges``.
        """
        if edge_attr is None:
            # Legacy behaviour: fall back to the module-level `edge_attr`.
            edge_attr = globals()["edge_attr"]

        x = self.conv1(x=x, edge_index=edge_index, edge_attr=edge_attr)
        x = F.relu(x)
        x = self.conv2(x=x, edge_index=edge_index, edge_attr=edge_attr)
        x = F.relu(x)

        # Edge embedding = [embedding of endpoint 0 | embedding of endpoint 1].
        x = torch.hstack((
            x[target_edges[0, :]],
            x[target_edges[1, :]],
        ))
        x = self.edge_regression(x)

        return x

#### Let's create the model instance

In [None]:
# Instantiate the model: the input width is the number of columns of the node
# feature matrix; hidden and output embeddings are 20 wide.
# NOTE(review): edge_dim is taken from edge_attr.shape[0], i.e. the size of
# the first axis of edge_attr, which was created with size (g.num_edges(),).
# That is the number of edges rather than a per-edge feature count — confirm
# this is intended.
model = GINEModel(in_channels=network_data.x.shape[1],
                 hidden_channels=20,
                 out_channels=20,
                 edge_dim=network_data.edge_attr.shape[0],
)

display(model)

#### Create the optimizer and loss instances

In [None]:
# Adam optimizer over all model parameters, with weight decay.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005, weight_decay=1e-4)
# Binary cross-entropy computed on raw scores (logits); the MSE alternative
# is kept commented out.
criterion = torch.nn.BCEWithLogitsLoss()
# criterion = torch.nn.MSELoss()

# Epoch counter read and incremented by the training loop cell.
epoch = 0

#### Define the train function and train the model

In [None]:
def train():
    """Run one optimisation step on the training split and return the loss.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``train_data``. The model output is fed to the criterion as-is (the
    notebook's criterion works on logits).
    """
    model.train()
    optimizer.zero_grad()

    edge_logits = model(
        train_data.x,
        train_data.edge_index,
        train_data.edge_label_index,
    )
    loss = criterion(edge_logits.squeeze(), train_data.edge_label)

    loss.backward()
    optimizer.step()

    return loss


#### Define the test function

In [None]:
@torch.no_grad()
def test():
    """Score the test edges and return ``(probabilities, labels)``.

    Uses the notebook-level ``model`` and ``test_data``. The model output is
    passed through a sigmoid; the labels are ``test_data.edge_label``.
    Gradients are disabled by the decorator.
    """
    model.eval()

    edge_logits = model(
        test_data.x,
        test_data.edge_index,
        test_data.edge_label_index,
    ).squeeze()

    return torch.sigmoid(edge_logits), test_data.edge_label


### Training and testing

In [None]:
from tqdm import tqdm

# Run 2001 optimisation steps. `epoch` is the notebook-level counter created
# in the optimizer cell, so re-running this cell continues the count.
for _ in tqdm(range(2001)):
    loss = train()

    # Report the loss whenever the epoch counter is a multiple of 500.
    if not epoch % 500:
        display(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    epoch += 1

In [None]:
# Predicted probabilities and ground-truth labels for the test edges.
edge_probability, test_y = test()

### Evaluate the model performance
The [_torchmetrics_](https://torchmetrics.readthedocs.io/en/stable/) package provides many performance metrics for various tasks

It is inspired by _scikit-learn_'s _metrics_ subpackage

In [None]:
from torchmetrics import Accuracy, AUROC

# Metric objects: accuracy with a 0.5 decision threshold, and area under the
# ROC curve.
accuracy = Accuracy(threshold=0.5)
auroc = AUROC()

# Both metrics are given the labels cast to integers.
test_labels = test_y.to(torch.int)

display("Accuracy", accuracy(edge_probability, test_labels).item())
display("AUROC", auroc(edge_probability, test_labels).item())

<div class="text-center">
    <h1> Thank you!</h1>
</div>

<div class="text-center">
    <h1>Questions?</h1>
</div>