Company Movie Night

Cloistered Monkey

2019-04-11 13:04

The Departure

This is a look at working with networks using networkx. Our scene - eight employees are trying to choose three movies to watch for company movie nights. We have two sources of data - the candidate movies and the relationship between pairs of employees. The relationships are on a scale from -100 to 100, with -100 being the strongest of enemies and 100 meaning they are best of friends. Zero either means they have no relationship (don't interact) or are indifferent about the other person.

Imports

From Python

from argparse import Namespace
from functools import partial
from pathlib import Path
import copy
import os

From PyPi

from dotenv import load_dotenv
from bokeh.models import (
    BoxSelectTool,
    Circle,
    HoverTool, 
    MultiLine,
    PanTool,
    Range1d,
    TapTool,
    WheelZoomTool,
)
from bokeh.models import Plot as BokehPlot
from bokeh.models.graphs import (from_networkx, 
                                 EdgesAndLinkedNodes, 
                                 NodesAndLinkedEdges)
from bokeh.palettes import RdBu, Spectral4
from bokeh.transform import linear_cmap
import holoviews
from holoviews import dim
import hvplot.pandas
import networkx
import pandas
import numpy
from networkx.algorithms import bipartite

My Stuff

from graeae.visualization import EmbedBokeh, EmbedHoloview

Set Up

Load Dotenv

# Read KEY=VALUE pairs from the local ".env" file into the process environment;
# override=True lets the file's values replace variables that are already set.
load_dotenv(".env", override=True)

The Plotting

# Every plot embedded in this post is saved beneath OUTPUT.
SLUG = "company-movie-night/"
OUTPUT = Path("../../files/posts/networks/") / SLUG

# Pre-bind the output folder so each embed call only needs a plot and a file name.
Embed = partial(EmbedHoloview, folder_path=OUTPUT)
EmbedB = partial(EmbedBokeh, folder_path=OUTPUT)

# Render HoloViews objects with the bokeh backend.
holoviews.extension("bokeh")

# Shared look-and-feel settings read by the plots in this post.
Plot = Namespace(
    # nodes
    node_size=15,
    # edges
    edge_alpha=0.8,
    edge_color="#CCCCCC",
    edge_width=5,
    # figure
    width=800,
    height=800,
    fontsize=18,
    # axis ranges handed to ``redim.range`` as keyword arguments
    padding={"x": (-1, 1), "y": (-1, 1)},
)

The Initiation

The Data

This Is the Set Of Employee-Relationships

# Index os.environ directly so a missing variable fails with
# KeyError('EMPLOYEE_RELATIONSHIPS') instead of the opaque TypeError that
# Path(None) raises.
employee_relationships_path = Path(os.environ["EMPLOYEE_RELATIONSHIPS"])
# The file is read as tab-separated with no header row, so the column names
# are supplied here.
relationships_data = pandas.read_csv(
    employee_relationships_path,
    delimiter="\t",
    header=None,
    names=["employee_1", "employee_2", "relationship"],
)

# Save the raw data as a table for the post.
table = holoviews.Table(relationships_data).opts(height=Plot.height)
Embed(plot=table, file_name="relationships_data")()

# An employee can appear in either column, so take the union of both.
employees = (set(relationships_data.employee_1.unique())
             | set(relationships_data.employee_2.unique()))
print(employees)

{'Pablo', 'Georgia', 'Andy', 'Claude', 'Vincent', 'Joan', 'Lee', 'Frida'}

print(len(employees))

print(len(relationships_data))

We have eight employees and twenty-eight links. Is this a fully connected graph? The handshake problem says that the number of links in a fully-connected network of n nodes is:

\[ links = \frac{n(n-1)}{2} \]

print(len(employees) * (len(employees) - 1)/2)

28.0

It looks like our relationships data creates a fully-connected network (unless there is a duplicate which would be an error).

These Are the Movie Choices

# Index os.environ directly so a missing variable fails with
# KeyError('EMPLOYEE_MOVIE_CHOICES') instead of the opaque TypeError that
# Path(None) raises.
employee_movies_path = Path(os.environ["EMPLOYEE_MOVIE_CHOICES"])
# skiprows=1 drops the first line of the file (presumably a header row -
# confirm against the data file) and the column names are supplied here.
movies_data = pandas.read_csv(
    employee_movies_path,
    delimiter="\t",
    header=None,
    skiprows=1,
    names=["employee", "movie"],
)
print(f"Employee To Movie Edges: {len(movies_data)}")
print(movies_data.head())

Employee To Movie Edges: 24
  employee                            movie
0     Andy                         Anaconda
1     Andy                       Mean Girls
2     Andy                       The Matrix
3   Claude                         Anaconda
4   Claude  Monty Python and the Holy Grail

# The distinct titles across every employee's picks, listed alphabetically.
movies = set(movies_data["movie"].unique())
for title in sorted(movies):
    print(f" - {title}")

Anaconda
Forrest Gump
Kung Fu Panda
Mean Girls
Monty Python and the Holy Grail
Snakes on a Plane
The Dark Knight
The Godfather
The Matrix
The Shawshank Redemption
The Social Network

The eight employees chose 11 movies between them.

Converting the DataFrames to Graphs

The Relationship Graph

# One node per employee, one edge per row, with the score stored on the edge.
relationship_graph = networkx.from_pandas_edgelist(
    relationships_data,
    source="employee_1",
    target="employee_2",
    edge_attr="relationship",
)

# Spot-check a random handful of rows against the edge data in the graph.
for sampled in relationships_data.sample(5).itertuples(index=False):
    edge = relationship_graph[sampled.employee_1][sampled.employee_2]
    assert edge["relationship"] == sampled.relationship

# The graph should have exactly one edge per row and one node per employee.
expected_edges, expected_nodes = len(relationships_data), len(employees)
print(f"Expected Edges: {expected_edges}")
print(f"Expected Nodes: {expected_nodes}")
assert len(relationship_graph.nodes) == expected_nodes
assert len(relationship_graph.edges) == expected_edges

Expected Edges: 28
Expected Nodes: 8

The Movie Graph

movie_graph = networkx.from_pandas_edgelist(movies_data, "employee", "movie")

# Copy both endpoints onto each edge so they are available as edge data.
for person, film in zip(movies_data.employee, movies_data.movie):
    movie_graph[person][film].update(employee=person, movie=film)

# Tag every node with the column it came from: employees first, then movies.
for column in ("employee", "movie"):
    for node in movies_data[column]:
        movie_graph.nodes[node]["type"] = column

Plotting

def graph_plot(graph: networkx.Graph, 
               title:str,
               file_name: str,
               hover: HoverTool,
               layout=networkx.circular_layout) -> None:
    """Plot the graph in bokeh and save it for embedding

    The edges are colored by their ``relationship`` value, so the graph's
    edges need that attribute.

    Args:
     graph: the graph to plot
     title: title for the plot
     file_name: name to save the bokeh file (without extension)
     hover: defined hover tool
     layout: function to layout the plot
    """
    plot = BokehPlot(plot_width=Plot.width,
                     plot_height=Plot.height,
                     x_range=Range1d(-1, 1),
                     y_range=Range1d(-1, 1)
    )

    plot.title.text = title
    plot.title.text_font_size = f"{Plot.fontsize}pt"
    plot.add_tools(hover, TapTool(), BoxSelectTool(), PanTool(), WheelZoomTool())

    # Render the graph that was passed in (this used to always render the
    # module-level ``relationship_graph``, ignoring the argument).
    renderer = from_networkx(graph, layout,
                             scale=1, center=(0, 0))

    # Nodes: one color each for the normal, selected and hovered states.
    renderer.node_renderer.glyph = Circle(size=Plot.node_size,
                                          fill_color=Spectral4[0])
    renderer.node_renderer.selection_glyph = Circle(size=Plot.node_size,
                                                    fill_color=Spectral4[2])
    renderer.node_renderer.hover_glyph = Circle(size=Plot.node_size,
                                                fill_color=Spectral4[1])

    # low/high are deliberately reversed so that -100 lands on the red end of
    # the palette and 100 on the blue end.
    color_map = linear_cmap(field_name="relationship",
                            palette=RdBu[11],
                            low=100, high=-100)
    renderer.edge_renderer.glyph = MultiLine(line_color=color_map,
                                             line_alpha=0.5,
                                             line_width=3)
    renderer.edge_renderer.selection_glyph = MultiLine(line_color=color_map,
                                                       line_width=Plot.edge_width)
    renderer.edge_renderer.hover_glyph = MultiLine(line_color=color_map,
                                                   line_width=Plot.edge_width)

    # Selecting works from the nodes, hovering works from the edges.
    renderer.selection_policy = NodesAndLinkedEdges()
    renderer.inspection_policy = EdgesAndLinkedNodes()
    plot.renderers.append(renderer)
    EmbedB(plot=plot, file_name=file_name)()
    return

class RelationshipGraphPlot:
    """Plots a graph and keeps the parts so you can inspect them

    Each part (tools, color map, renderer, plot) is built the first time its
    property is read and re-used after that. The edges are colored by their
    ``relationship`` value, so the graph's edges need that attribute.

    Args:
     graph: the graph to plot
     title: title for the plot
     file_name: name to save the bokeh file (without extension)
     hover: defined hover tool
     settings: namespace with the plot settings (width, height, fontsize,
               node_size and edge_width are read from it)
     layout: function to layout the plot
    """
    def __init__(self, graph: networkx.Graph, 
                 title:str,
                 file_name: str,
                 hover: HoverTool,
                 settings: Namespace=Plot,
                 layout=networkx.circular_layout) -> None:
        self.graph = graph
        self.title = title
        self.file_name = file_name
        self.hover = hover
        self.settings = settings
        self.layout = layout
        self._tap_tool = None
        self._box_select_tool = None
        self._pan_tool = None
        self._wheel_zoom_tool = None
        self._plot = None
        self._renderer = None
        self._color_map = None
        return

    @property
    def tap_tool(self) -> TapTool:
        """The tap tool added to the plot"""
        if self._tap_tool is None:
            self._tap_tool = TapTool()
        return self._tap_tool

    @property
    def box_select_tool(self) -> BoxSelectTool:
        """The box-select tool added to the plot"""
        if self._box_select_tool is None:
            self._box_select_tool = BoxSelectTool()
        return self._box_select_tool

    @property
    def pan_tool(self) -> PanTool:
        """The pan tool added to the plot"""
        if self._pan_tool is None:
            self._pan_tool = PanTool()
        return self._pan_tool

    @property
    def wheel_zoom_tool(self) -> WheelZoomTool:
        """The wheel-zoom tool added to the plot"""
        if self._wheel_zoom_tool is None:
            self._wheel_zoom_tool = WheelZoomTool()
        return self._wheel_zoom_tool

    @property
    def plot(self) -> BokehPlot:
        """The bokeh plot with the title, tools and graph renderer attached"""
        if self._plot is None:
            self._plot = BokehPlot(plot_width=self.settings.width,
                                   plot_height=self.settings.height,
                                   x_range=Range1d(-1, 1),
                                   y_range=Range1d(-1, 1)
            )

            self._plot.title.text = self.title
            # Use the instance's settings, not the module-level ``Plot``.
            self._plot.title.text_font_size = f"{self.settings.fontsize}pt"
            self._plot.add_tools(self.hover,
                                 self.tap_tool,
                                 self.box_select_tool,
                                 self.pan_tool,
                                 self.wheel_zoom_tool)
            self._plot.renderers.append(self.renderer)
        return self._plot

    @property
    def color_map(self):
        """Maps the edges' ``relationship`` values onto the RdBu palette"""
        if self._color_map is None:
            # low/high are deliberately reversed so that -100 lands on the
            # red end of the palette and 100 on the blue end.
            self._color_map = linear_cmap(field_name="relationship",
                                          palette=RdBu[11],
                                          low=100, high=-100)
        return self._color_map

    @property
    def renderer(self):
        """The bokeh graph renderer built from the networkx graph"""
        if self._renderer is None:
            self._renderer = from_networkx(self.graph, self.layout,
                                           scale=1, center=(0, 0))

            # Nodes: one color each for the normal, selected and hovered states.
            node_size = self.settings.node_size
            self._renderer.node_renderer.glyph = Circle(
                size=node_size,
                fill_color=Spectral4[0])
            self._renderer.node_renderer.selection_glyph = Circle(
                size=node_size,
                fill_color=Spectral4[2])
            self._renderer.node_renderer.hover_glyph = Circle(
                size=node_size,
                fill_color=Spectral4[1])

            # Edges: always go through the ``color_map`` property so the map
            # exists no matter which glyph is built first.
            edge_width = self.settings.edge_width
            self._renderer.edge_renderer.glyph = MultiLine(
                line_color=self.color_map,
                line_alpha=0.5,
                line_width=3)
            self._renderer.edge_renderer.selection_glyph = MultiLine(
                line_color=self.color_map,
                line_width=edge_width)
            self._renderer.edge_renderer.hover_glyph = MultiLine(
                line_color=self.color_map,
                line_width=edge_width)

            # Selecting works from the nodes, hovering works from the edges.
            self._renderer.selection_policy = NodesAndLinkedEdges()
            self._renderer.inspection_policy = EdgesAndLinkedNodes()
        return self._renderer

    def __call__(self) -> None:
        """Builds the plot (if needed) and saves it for embedding"""
        EmbedB(plot=self.plot, file_name=self.file_name)()
        return

The Employee Relationship Plot

HoloViews

The employee relationship graph consists of employees as nodes and their relationship level as weights on the edges.

# Figure-level options first, then the edge coloring, then the axis ranges.
figure_options = dict(
    cmap="Set1",
    fontsize=Plot.fontsize,
    width=800,
    height=800,
    title="Company Relationship Graph",
    xaxis=None,
    yaxis=None,
)
edge_options = dict(edge_color_index="relationship", edge_cmap="Spectral")

plot = holoviews.Graph.from_networkx(relationship_graph,
                                     networkx.circular_layout)
plot = plot.opts(**figure_options).options(**edge_options)
plot = plot.redim.range(**Plot.padding)

# Convert to a bokeh figure so the selection policy can be set on its last
# renderer before saving.
renderer = holoviews.render(plot)
renderer.renderers[-1].selection_policy = EdgesAndLinkedNodes()
EmbedB(plot=renderer, file_name="employee_relationships")()

Bokeh

This is the same plot using bokeh directly instead of holoviews. I wanted both the nodes and edges to trigger the HoverTool but HoloViews doesn't support this. There might be a way to add it later, but their documentation is so opaque that I decided it wasn't worth it to keep trying to figure it out, since bokeh isn't that hard to use (although their documentation isn't the best either).

Since bokeh is so verbose I'm going to step through this instead of putting it into one block.

Hover Tool

When the EdgesAndLinkedNodes class is used, only the edge data is available to the hover tool (or at least I couldn't figure out how to make the node attributes work), so anything the tooltip shows has to be stored on the edges. You can see what's available once you've created the graph renderer (the output of from_networkx) by looking at the keys of the edge renderer's data source:

renderer.edge_renderer.data_source.data.keys()

Which in this case returned this.

dict_keys(['relationship', 'start', 'end'])

relationship was a data-attribute I added through networkx, something else (presumably bokeh) created the start and end.

# "@name" in a tooltip refers to the column "name" in the edge data shown above.
edge_tooltips = [
    ("Employee", "@start"),
    ("Employee", "@end"),
    ("Relationship", "@relationship"),
]
hover = HoverTool(tooltips=edge_tooltips)

The Plot

This is the bokeh plot (I don't know how it differs from a figure). It's normally called Plot but I already used that name for my settings object so I called it BokehPlot.

# An empty figure whose axes run from -1 to 1 in both directions.
plot = BokehPlot(
    plot_width=Plot.width,
    plot_height=Plot.height,
    x_range=Range1d(-1, 1),
    y_range=Range1d(-1, 1),
)

plot.title.text = "Company Relationships"
plot.title.text_font_size = f"{Plot.fontsize}pt"

# The hover tool defined earlier plus the standard interaction tools.
tools = (hover, TapTool(), BoxSelectTool(), PanTool(), WheelZoomTool())
plot.add_tools(*tools)

The Graph Renderer

This part converts the networkx Graph into a bokeh Graph. This is what I was referring to earlier when I talked about inspecting the renderer to look at the available edge attributes.

# The extra keyword arguments (scale, center) are handed on to the layout
# function, here networkx.circular_layout.
renderer = from_networkx(relationship_graph, networkx.circular_layout, 
                         scale=1, center=(0,0))

Nodes

You have to set up the shapes for the nodes - I think - there might be defaults but the few examples I found set it up. It's probably not a bad idea in any case. The Spectral4 object is a list of four hex-colors. Here's the ones I used.

Object	Index	Color
glyph	0	Medium Blue
selection_glyph	2	Orange
hover_glyph	1	Pastel Green

# (glyph attribute, Spectral4 index) for the normal, selected and hovered states.
node_states = (
    ("glyph", 0),
    ("selection_glyph", 2),
    ("hover_glyph", 1),
)
for attribute, color_index in node_states:
    setattr(renderer.node_renderer,
            attribute,
            Circle(size=Plot.node_size, fill_color=Spectral4[color_index]))

Color Map

The linear_cmap maps a range of values to a palette of colors. In this case I'm mapping the relationship values to the red-blue palette (RdBu). Two things to note:

I chose a red-blue palette with 11 values because the odd-number puts white at the center (it goes from blue to white to red)
Although the name suggests a palette from red to blue, it goes from blue to red, so I had to make -100 the 'high' value so that red would be a bad relationship.

# low/high are deliberately reversed (low=100, high=-100) so that -100 maps to
# the red end of the palette and 100 to the blue end.
color_map = linear_cmap(field_name="relationship", palette=RdBu[11], low=100, high=-100)

The Edges

Like the nodes you define the edges for the plot. This is where we get to use the color-map to make the edges match the relationship between the employees.

# Resting edges are thin and half-transparent.
renderer.edge_renderer.glyph = MultiLine(
    line_color=color_map,
    line_alpha=0.5,
    line_width=3,
)
# Selected and hovered edges are drawn at the full settings width.
for attribute in ("selection_glyph", "hover_glyph"):
    setattr(renderer.edge_renderer,
            attribute,
            MultiLine(line_color=color_map, line_width=Plot.edge_width))

The Selection and Inspection Policies

This was the reason for doing it in bokeh in the first place. Adding these two lines makes both the edge and attached nodes highlight when selected or hovered over.

# Selection (tap / box-select) uses the nodes-and-linked-edges policy.
renderer.selection_policy = NodesAndLinkedEdges()
# Inspection (hover) uses the edges-and-linked-nodes policy, which is why the
# hover tool's tooltips refer to edge data.
renderer.inspection_policy = EdgesAndLinkedNodes()

Put It All Together

Now we just add the graph-renderer to the plot and have bokeh convert it to JavaScript and HTML.

# Attach the graph renderer to the figure, then save the figure under the
# post's output folder for embedding.
plot.renderers.append(renderer)
EmbedB(plot=plot, file_name="company_relationships_bokeh")()

It looks like Andy might have some kind of personality problem (maybe he's the boss), while Georgia and Claude are unusually close.

Spring loaded

Bokeh raises an error if you try to re-use the hover tool in a second plot, so I had to create a new one with the same tooltips.

# A fresh hover tool with the same tooltips as the one used for the circular plot.
spring_tooltips = [
    ("Employee", "@start"),
    ("Employee", "@end"),
    ("Relationship", "@relationship"),
]
hover = HoverTool(tooltips=spring_tooltips)

# Same plot as before, laid out with the spring layout instead of a circle.
graph_plot(
    graph=relationship_graph,
    title="Company Relationships",
    file_name="company_relationships_spring",
    hover=hover,
    layout=networkx.spring_layout,
)

This didn't produce as interesting a result as I thought.

The Movie Plot

# The tooltips read the "employee" and "movie" values stored on each edge.
movie_tooltips = [
    ("Employee", "@employee"),
    ("Movie", "@movie"),
]
movie_hover = HoverTool(tooltips=movie_tooltips)

# Nodes are colored by their "type" attribute (employee or movie).
movie_figure_options = dict(
    node_color=dim("type"),
    cmap="Set1",
    fontsize=Plot.fontsize,
    width=Plot.width,
    height=Plot.height,
    tools=[movie_hover, TapTool()],
    title="Company Movies Graph",
    xaxis=None,
    yaxis=None,
)
movie_edge_options = dict(inspection_policy="edges", edge_cmap="Spectral")

plot = holoviews.Graph.from_networkx(movie_graph, networkx.circular_layout)
plot = plot.opts(**movie_figure_options).options(**movie_edge_options)
plot = plot.redim.range(**Plot.padding)
Embed(plot=plot, file_name="company_movies_circle")()

The Blue nodes are employees and the red nodes are movies.

Question 2

Using the graph from the previous question, add nodes attributes named `'type'` where movies have the value `'movie'` and employees have the value `'employee'` and return that graph.

This function should return a networkx graph with node attributes `{'type': 'movie'}` or `{'type': 'employee'}`

def answer_two():
    """Builds a copy of the employee-movie graph with typed nodes

    Employees get ``bipartite=0, type='employee'`` and movies get
    ``bipartite=1, type='movie'``; the edges are copied over unchanged.

    NOTE(review): ``answer_one`` is not defined in the visible part of this
    file - confirm where it comes from.

    Returns:
     Graph: answer_one with 'type' attribute added (employee or movie)
    """
    source = answer_one()
    typed = networkx.Graph()
    # (membership set, node attributes) - employees are added before movies
    partitions = (
        (employees, dict(bipartite=0, type="employee")),
        (movies, dict(bipartite=1, type="movie")),
    )
    for members, attributes in partitions:
        selected = [node for node in source.nodes() if node in members]
        typed.add_nodes_from(selected, **attributes)
    typed.add_edges_from(source.edges())
    return typed

two = answer_two()
# Show every node together with its attribute dictionary.
two.nodes(data=True)

# NOTE(review): no `plot_graph` is defined in the visible part of this file
# (the plotting helper above is `graph_plot`, which takes more arguments) -
# confirm where this name comes from.
plot_graph(two)

Question 3

Find a weighted projection of the graph from `answer_two` which tells us how many movies different pairs of employees have in common.

This function should return a weighted projected graph.

def answer_three():
    """Projects the typed employee-movie graph onto the employees

    Returns:
     Graph: weighted projection of the answer_two graph onto ``employees``
    """
    typed = answer_two()
    # The projection only makes sense for a two-sided graph.
    assert networkx.is_bipartite(typed)
    projection = bipartite.weighted_projected_graph(typed, employees)
    return projection

three = answer_three()
# NOTE(review): no `plot_graph` is defined in the visible part of this file
# (the plotting helper above is `graph_plot`) - confirm where this name comes from.
plot_graph(three)

Question 4

Suppose you'd like to find out if people that have a high relationship score also like the same types of movies.

Find the Pearson correlation ( using `DataFrame.corr()` ) between employee relationship scores and the number of movies they have in common. If two employees have no movies in common it should be treated as a 0, not a missing value, and should be included in the correlation calculation.

This function should return a float.

def answer_four():
    """Correlates relationship scores with the number of shared movies

    The relationship scores are read from ``Employee_Relationships.txt`` in
    the working directory and joined to the edge weights of the answer_three
    projection on the (sorted) pair of employee names. Pairs without an edge
    in the projection get a weight of 0.

    Returns:
     float: Pearson correlation for weight and relationship_score
    """
    def pair_key(row):
        """Order-independent key for a pair of employees"""
        return tuple(sorted((row["employee_left"], row["employee_right"])))

    projection = answer_three()

    scores = pandas.read_table(
        "Employee_Relationships.txt",
        names=["employee_left", "employee_right", "relationship_score"])
    scores["employees"] = scores.apply(pair_key, axis=1)

    # Each edge arrives as (left, right, attribute-dict), so pull the number
    # out of the dict that lands in the third column.
    shared = pandas.DataFrame(
        projection.edges(data=True),
        columns=["employee_left", "employee_right", "weight"])
    shared["weight"] = shared.weight.map(
        lambda attributes: attributes["weight"])
    shared["employees"] = shared.apply(pair_key, axis=1)

    merged = pandas.merge(scores, shared, how="outer", on=["employees"])
    # The outer join must not have added any pairs beyond the scored ones.
    assert len(merged) == len(scores)
    merged["weight"] = merged["weight"].fillna(0)

    correlations = merged[["relationship_score", "weight"]].corr()
    return correlations.relationship_score.weight

print(answer_four())