Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions dgf/src/analyse/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ py_test(
],
)

# Library target for topology_statistics.py, which computes topology
# statistics in process on in-memory graphs. The deps mirror that module's
# imports: the sibling histogram helper, the in-memory graph, schema and
# statistics data libraries, and numpy.
py_library(
    name = "topology_statistics",
    srcs = ["topology_statistics.py"],
    deps = [
        ":histogram",
        "//dgf/src/data:in_memory_graph",
        "//dgf/src/data:schema",
        "//dgf/src/data:statistics",
        # numpy dep,
    ],
)

py_library(
name = "print_schema",
srcs = ["print_schema.py"],
Expand Down Expand Up @@ -198,6 +210,20 @@ py_test(
],
)

# Unit test target for :topology_statistics. Besides the library under test it
# pulls in absltest, the histogram/statistics data types used to spell out the
# expected results, and the test-graph generator and test utilities.
# NOTE(review): topology_statistics_test.py has no direct numpy import in this
# change; confirm whether the numpy dep listed below is actually required.
py_test(
    name = "topology_statistics_test",
    srcs = ["topology_statistics_test.py"],
    deps = [
        ":topology_statistics",
        # absl/testing:absltest dep,
        "//dgf/src/data:histogram",
        "//dgf/src/data:statistics",
        "//dgf/src/util:gen_test_graph",
        "//dgf/src/util:test_util",
        # numpy dep,
    ],
)

py_test(
name = "print_schema_test",
srcs = ["print_schema_test.py"],
Expand Down
198 changes: 198 additions & 0 deletions dgf/src/analyse/topology_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Copyright 2022 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compute topology statistics, in process, on InMemoryGraph."""

import dataclasses
from typing import Iterable, List
from dgf.src.analyse import histogram as analyse_hist
from dgf.src.data import in_memory_graph as in_memory_graph_lib
from dgf.src.data import schema as schema_lib
from dgf.src.data import statistics as statistics_lib
import numpy as np


@dataclasses.dataclass
class _NodeSetAccumulator:
  """Per-node-set values collected while iterating over graphs.

  Attributes:
    node_counts: Number of nodes of this node set in each processed graph, one
      entry per graph.
  """

  # default_factory (rather than a shared literal) gives every instance its
  # own list, so an accumulator can be created empty without arguments.
  node_counts: List[int] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class _EdgeSetAccumulator:
  """Per-edge-set values collected while iterating over graphs.

  Attributes:
    edge_counts: Number of edges of this edge set in each processed graph, one
      entry per graph.
    in_degrees: One integer array per processed graph holding the in-degree of
      every node of the edge set's target node set.
    out_degrees: One integer array per processed graph holding the out-degree
      of every node of the edge set's source node set.
  """

  # default_factory (rather than a shared literal) gives every instance its
  # own lists, so an accumulator can be created empty without arguments.
  edge_counts: List[int] = dataclasses.field(default_factory=list)
  in_degrees: List[np.ndarray] = dataclasses.field(default_factory=list)
  out_degrees: List[np.ndarray] = dataclasses.field(default_factory=list)


def _get_num_nodes(nodeset: in_memory_graph_lib.InMemoryNodeSet) -> int:
  """Returns the number of nodes in `nodeset`.

  The explicit `num_nodes` value wins when it is set. Otherwise the count is
  read from the leading dimension of the first feature, and a node set with
  neither an explicit count nor any feature is reported as empty.

  Args:
    nodeset: The node set to measure.

  Returns:
    The node count, or 0 if it cannot be determined.
  """
  explicit_count = nodeset.num_nodes
  if explicit_count is not None:
    return explicit_count

  feature_map = nodeset.features
  if not feature_map:
    return 0

  # All that is needed is one feature; its first axis length is the node count.
  first_feature = next(iter(feature_map.values()))
  return first_feature.shape[0]


def topology_statistics(
    graph: in_memory_graph_lib.InMemoryGraph,
    schema: schema_lib.GraphSchema,
    num_bins: int = 32,
) -> statistics_lib.GraphTopologyStatistics:
  """Computes the topology statistics of one InMemoryGraph.

  Convenience wrapper around `topology_statistics_from_graphs` for the
  single-graph case.

  Usage example:

  ```python
  # Load a graph together with its schema.
  graph, schema = dgf.io.read_graph("/tmp/my_graph")

  # Compute its topology statistics.
  topo_stats = dgf.analyse.topology_statistics(graph, schema)

  # Print them (histograms are rendered as ASCII art).
  print(topo_stats)
  ```

  Args:
    graph: The in-memory graph to analyse.
    schema: Schema describing `graph`.
    num_bins: Number of bins of the degree distribution histograms.

  Returns:
    A GraphTopologyStatistics object.
  """
  single_graph_batch = [graph]
  return topology_statistics_from_graphs(single_graph_batch, schema, num_bins)


def _degree_counts(indices: np.ndarray, num_nodes: int) -> np.ndarray:
  """Counts how often each node index occurs in `indices`.

  Args:
    indices: Array of node indices, one entry per edge endpoint.
    num_nodes: Length of the returned array.

  Returns:
    Integer array of length `num_nodes` whose i-th entry is the number of
    occurrences of i in `indices`; indices that never occur keep a count of 0.
  """
  degrees = np.zeros(num_nodes, dtype=int)
  if indices.size > 0:
    node_ids, occurrences = np.unique(indices, return_counts=True)
    degrees[node_ids] = occurrences
  return degrees


def _integer_log_histogram(values: np.ndarray, num_bins: int):
  """Builds the log-scale integer histogram used for every topology count."""
  return analyse_hist.make_histogram(
      values, num_bins=num_bins, log_scale=True, is_integer=True
  )


def topology_statistics_from_graphs(
    graphs: Iterable[in_memory_graph_lib.InMemoryGraph],
    schema: schema_lib.GraphSchema,
    num_bins: int = 32,
) -> statistics_lib.GraphTopologyStatistics:
  """Computes topology statistics for a set of InMemoryGraphs.

  For every node set in the schema, the number of nodes per graph is
  histogrammed. For every edge set, the number of edges per graph is
  histogrammed, and so are the in-degrees and out-degrees of all nodes pooled
  over all graphs. Nodes without any edge contribute a degree of 0.

  Usage example:

  ```python
  # Read graphs
  graphs, schema = dgf.io.read_tfgnn_graphs("/my/data@10")

  # Compute the topology statistics
  topo_stats = dgf.analyse.topology_statistics_from_graphs(graphs, schema)

  # Print the statistics
  print(topo_stats)
  ```

  Args:
    graphs: An iterable of in-memory graphs. It is consumed exactly once, and
      every node set and edge set named in `schema` is looked up in each
      graph.
    schema: Schema of the graphs.
    num_bins: Number of bins to use for histograms.

  Returns:
    GraphTopologyStatistics object.

  Raises:
    ValueError: If `graphs` yields no graph.
  """
  node_accumulators = {
      name: _NodeSetAccumulator(node_counts=[]) for name in schema.node_sets
  }
  edge_accumulators = {
      name: _EdgeSetAccumulator(edge_counts=[], in_degrees=[], out_degrees=[])
      for name in schema.edge_sets
  }

  # `graphs` may be a one-shot iterable without a length, so count as we go.
  num_graphs = 0
  for graph in graphs:
    num_graphs += 1

    for name, node_acc in node_accumulators.items():
      node_acc.node_counts.append(_get_num_nodes(graph.node_sets[name]))

    for name, edge_acc in edge_accumulators.items():
      edgeset = graph.edge_sets[name]
      edge_acc.edge_counts.append(edgeset.num_edges())

      edge_schema = schema.edge_sets[name]
      num_sources = _get_num_nodes(graph.node_sets[edge_schema.source])
      num_targets = _get_num_nodes(graph.node_sets[edge_schema.target])

      # Row 1 of the adjacency holds the target index of each edge, row 0 the
      # source index.
      edge_acc.in_degrees.append(
          _degree_counts(edgeset.adjacency[1], num_targets)
      )
      edge_acc.out_degrees.append(
          _degree_counts(edgeset.adjacency[0], num_sources)
      )

  # Must be checked before aggregating: np.concatenate rejects empty lists.
  if num_graphs == 0:
    raise ValueError("The input 'graphs' iterable was empty.")

  node_sets_stats = {
      name: statistics_lib.NodeSetTopologyStatistics(
          num_nodes=_integer_log_histogram(
              np.array(node_acc.node_counts), num_bins
          )
      )
      for name, node_acc in node_accumulators.items()
  }

  edge_sets_stats = {}
  for name, edge_acc in edge_accumulators.items():
    num_edges_hist = _integer_log_histogram(
        np.array(edge_acc.edge_counts), num_bins
    )

    # Degree distributions pool the nodes of all graphs together.
    in_degrees_flat = np.concatenate(edge_acc.in_degrees)
    out_degrees_flat = np.concatenate(edge_acc.out_degrees)

    edge_sets_stats[name] = statistics_lib.EdgeSetTopologyStatistics(
        num_edges=num_edges_hist,
        in_degree_distribution=_integer_log_histogram(
            in_degrees_flat, num_bins
        ),
        out_degree_distribution=_integer_log_histogram(
            out_degrees_flat, num_bins
        ),
    )

  return statistics_lib.GraphTopologyStatistics(
      node_sets=node_sets_stats,
      edge_sets=edge_sets_stats,
      num_graphs=num_graphs,
  )
121 changes: 121 additions & 0 deletions dgf/src/analyse/topology_statistics_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright 2022 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for topology_statistics."""

from absl.testing import absltest
from dgf.src.analyse import topology_statistics as topo_stats_lib
from dgf.src.data import histogram as histogram_lib
from dgf.src.data import statistics as statistics_lib
from dgf.src.util import gen_test_graph
from dgf.src.util import test_util

# Called once at import time, before any test case runs.
# NOTE(review): presumably lifts the length limit on assertion-failure diffs so
# mismatching statistics are printed in full; confirm against test_util.
test_util.disable_diff_truncation()


class TopologyStatisticsTest(absltest.TestCase):
  """Tests the topology statistics computed on the generated test graph."""

  def _expected_stats(
      self, *, count_values, in_degree_values, out_degree_values, num_graphs
  ):
    """Builds the expected statistics shared by the test cases.

    Node sets "n1" and "n2" are expected to have identical statistics, and so
    are edge sets "e1" and "e2"; only the histogram values and the number of
    graphs differ between test cases, the bin edges do not.

    Args:
      count_values: Expected values of the num_nodes and num_edges histograms.
      in_degree_values: Expected values of the in-degree histograms.
      out_degree_values: Expected values of the out-degree histograms.
      num_graphs: Expected number of processed graphs.

    Returns:
      The expected GraphTopologyStatistics.
    """

    # Each call returns fresh objects (and fresh lists) so that no expected
    # histogram shares state with another one.
    def node_set_stats():
      return statistics_lib.NodeSetTopologyStatistics(
          num_nodes=histogram_lib.Histogram(
              values=list(count_values), bins=[2.0, 2.0]
          )
      )

    def edge_set_stats():
      return statistics_lib.EdgeSetTopologyStatistics(
          num_edges=histogram_lib.Histogram(
              values=list(count_values), bins=[2.0, 2.0]
          ),
          in_degree_distribution=histogram_lib.Histogram(
              values=list(in_degree_values), bins=[1.0, 1.0]
          ),
          out_degree_distribution=histogram_lib.Histogram(
              values=list(out_degree_values), bins=[0.0, 1.0, 2.0]
          ),
      )

    return statistics_lib.GraphTopologyStatistics(
        node_sets={"n1": node_set_stats(), "n2": node_set_stats()},
        edge_sets={"e1": edge_set_stats(), "e2": edge_set_stats()},
        num_graphs=num_graphs,
    )

  def test_topology_statistics(self):
    schema = gen_test_graph.generate_schema()
    graph = gen_test_graph.generate_in_memory_graph()

    stats = topo_stats_lib.topology_statistics(graph, schema, num_bins=2)

    # num_graphs is spelled out instead of relying on the default value of
    # GraphTopologyStatistics: a single graph must be reported as 1.
    expected_stats = self._expected_stats(
        count_values=[1.0],
        in_degree_values=[2.0],
        out_degree_values=[1.0, 1.0],
        num_graphs=1,
    )

    test_util.assert_are_equal(self, stats, expected_stats)

  def test_topology_statistics_from_graphs(self):
    schema = gen_test_graph.generate_schema()
    graph1 = gen_test_graph.generate_in_memory_graph()
    graph2 = gen_test_graph.generate_in_memory_graph()

    stats = topo_stats_lib.topology_statistics_from_graphs(
        [graph1, graph2], schema, num_bins=2
    )

    expected_stats = self._expected_stats(
        count_values=[2.0],
        in_degree_values=[4.0],
        out_degree_values=[2.0, 2.0],
        num_graphs=2,
    )

    test_util.assert_are_equal(self, stats, expected_stats)


# Run the absltest test runner when this file is executed as a script.
if __name__ == "__main__":
  absltest.main()
1 change: 1 addition & 0 deletions dgf/src/api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ py_library(
"//dgf/src/analyse:padding",
"//dgf/src/analyse:print_schema",
"//dgf/src/analyse:schema",
"//dgf/src/analyse:topology_statistics",
"//dgf/src/analyse/reports:data_model",
"//dgf/src/analyse/reports:reporter",
"//dgf/src/analyse/topology:global_graph_topology",
Expand Down
2 changes: 2 additions & 0 deletions dgf/src/api/analyse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from dgf.src.analyse.padding import padding_from_graph_generator
from dgf.src.analyse.in_process_feature_statistics import feature_statistics_from_graphs
from dgf.src.analyse.in_process_feature_statistics import feature_statistics
from dgf.src.analyse.topology_statistics import topology_statistics
from dgf.src.analyse.topology_statistics import topology_statistics_from_graphs
from dgf.src.analyse.histogram import make_histogram

# TODO: Use third_party/py/dgf/src/api/print.py version instead.
Expand Down