From 30b51db2b28b7dbfcf18308cd6e5553f1fae1f35 Mon Sep 17 00:00:00 2001 From: Mathieu Guillame-Bert Date: Fri, 12 Jun 2026 01:40:33 -0700 Subject: [PATCH] Internal change PiperOrigin-RevId: 931009295 --- dgf/src/analyse/BUILD | 26 +++ dgf/src/analyse/topology_statistics.py | 198 ++++++++++++++++++++ dgf/src/analyse/topology_statistics_test.py | 121 ++++++++++++ dgf/src/api/BUILD | 1 + dgf/src/api/analyse.py | 2 + 5 files changed, 348 insertions(+) create mode 100644 dgf/src/analyse/topology_statistics.py create mode 100644 dgf/src/analyse/topology_statistics_test.py diff --git a/dgf/src/analyse/BUILD b/dgf/src/analyse/BUILD index 6866dad..c1dd035 100644 --- a/dgf/src/analyse/BUILD +++ b/dgf/src/analyse/BUILD @@ -98,6 +98,18 @@ py_test( ], ) +py_library( + name = "topology_statistics", + srcs = ["topology_statistics.py"], + deps = [ + ":histogram", + "//dgf/src/data:in_memory_graph", + "//dgf/src/data:schema", + "//dgf/src/data:statistics", + # numpy dep, + ], +) + py_library( name = "print_schema", srcs = ["print_schema.py"], @@ -198,6 +210,20 @@ py_test( ], ) +py_test( + name = "topology_statistics_test", + srcs = ["topology_statistics_test.py"], + deps = [ + ":topology_statistics", + # absl/testing:absltest dep, + "//dgf/src/data:histogram", + "//dgf/src/data:statistics", + "//dgf/src/util:gen_test_graph", + "//dgf/src/util:test_util", + # numpy dep, + ], +) + py_test( name = "print_schema_test", srcs = ["print_schema_test.py"], diff --git a/dgf/src/analyse/topology_statistics.py b/dgf/src/analyse/topology_statistics.py new file mode 100644 index 0000000..1189e26 --- /dev/null +++ b/dgf/src/analyse/topology_statistics.py @@ -0,0 +1,198 @@ +# Copyright 2022 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compute topology statistics, in process, on InMemoryGraph."""
+
+import dataclasses
+from typing import Iterable, List
+from dgf.src.analyse import histogram as analyse_hist
+from dgf.src.data import in_memory_graph as in_memory_graph_lib
+from dgf.src.data import schema as schema_lib
+from dgf.src.data import statistics as statistics_lib
+import numpy as np
+
+
+@dataclasses.dataclass
+class _NodeSetAccumulator:
+  """Per-node-set values collected while iterating over the graphs."""
+
+  # Number of nodes of the node set in each graph, in iteration order.
+  node_counts: List[int] = dataclasses.field(default_factory=list)
+
+
+@dataclasses.dataclass
+class _EdgeSetAccumulator:
+  """Per-edge-set values collected while iterating over the graphs."""
+
+  # Number of edges of the edge set in each graph, in iteration order.
+  edge_counts: List[int] = dataclasses.field(default_factory=list)
+  # One array per graph; entry i is the in-degree of target node i.
+  in_degrees: List[np.ndarray] = dataclasses.field(default_factory=list)
+  # One array per graph; entry i is the out-degree of source node i.
+  out_degrees: List[np.ndarray] = dataclasses.field(default_factory=list)
+
+
+def _get_num_nodes(nodeset: in_memory_graph_lib.InMemoryNodeSet) -> int:
+  """Returns the number of nodes in a node set.
+
+  Uses the explicit `num_nodes` when it is set. Otherwise the count is read
+  from the leading dimension of the first feature. A node set with neither is
+  reported as empty.
+
+  Args:
+    nodeset: The node set to measure.
+
+  Returns:
+    The number of nodes.
+  """
+  if nodeset.num_nodes is not None:
+    return nodeset.num_nodes
+  if nodeset.features:
+    return next(iter(nodeset.features.values())).shape[0]
+  return 0
+
+
+def _count_degrees(indices: np.ndarray, num_nodes: int) -> np.ndarray:
+  """Counts how many times each node index appears in `indices`.
+
+  Args:
+    indices: Node index of one endpoint of every edge of an edge set.
+    num_nodes: Number of nodes in the node set the indices refer to.
+
+  Returns:
+    An integer array of length `num_nodes` where entry `i` is the number of
+    occurrences of `i` in `indices`. Nodes without any edge get 0.
+  """
+  degrees = np.zeros(num_nodes, dtype=int)
+  # An empty `indices` array is skipped explicitly: depending on its dtype it
+  # might not be usable as an index array.
+  if indices.size > 0:
+    unique_indices, counts = np.unique(indices, return_counts=True)
+    degrees[unique_indices] = counts
+  return degrees
+
+
+def topology_statistics(
+    graph: in_memory_graph_lib.InMemoryGraph,
+    schema: schema_lib.GraphSchema,
+    num_bins: int = 32,
+) -> statistics_lib.GraphTopologyStatistics:
+  """Computes topology statistics for a single InMemoryGraph.
+
+  Usage example:
+
+  ```python
+  # Read a graph
+  graph, schema = dgf.io.read_graph("/tmp/my_graph")
+
+  # Compute the topology statistics
+  topo_stats = dgf.analyse.topology_statistics(graph, schema)
+
+  # Print the statistics (shows beautiful ASCII art histograms)
+  print(topo_stats)
+  ```
+
+  Args:
+    graph: An in-memory graph.
+    schema: Schema of the graph.
+    num_bins: Number of bins to use for the histograms (node counts, edge
+      counts and degree distributions).
+
+  Returns:
+    GraphTopologyStatistics object.
+  """
+  return topology_statistics_from_graphs(
+      graphs=[graph],
+      schema=schema,
+      num_bins=num_bins,
+  )
+
+
+def topology_statistics_from_graphs(
+    graphs: Iterable[in_memory_graph_lib.InMemoryGraph],
+    schema: schema_lib.GraphSchema,
+    num_bins: int = 32,
+) -> statistics_lib.GraphTopologyStatistics:
+  """Computes topology statistics for a set of InMemoryGraphs.
+
+  The node counts and edge counts are histograms over the graphs (one value
+  per graph). The degree distributions are histograms over the nodes of all
+  the graphs pooled together.
+
+  Usage example:
+
+  ```python
+  # Read graphs
+  graphs, schema = dgf.io.read_tfgnn_graphs("/my/data@10")
+
+  # Compute the topology statistics
+  topo_stats = dgf.analyse.topology_statistics_from_graphs(graphs, schema)
+
+  # Print the statistics
+  print(topo_stats)
+  ```
+
+  Args:
+    graphs: An iterable of in-memory graphs. It is iterated over once, so a
+      generator is accepted.
+    schema: Schema of the graphs. Each graph is expected to contain every node
+      set and edge set listed in the schema.
+    num_bins: Number of bins to use for histograms.
+
+  Returns:
+    GraphTopologyStatistics object.
+
+  Raises:
+    ValueError: If `graphs` is empty.
+  """
+  node_accumulators = {name: _NodeSetAccumulator() for name in schema.node_sets}
+  edge_accumulators = {name: _EdgeSetAccumulator() for name in schema.edge_sets}
+
+  # `graphs` may be a one-shot iterable, so the graphs are counted while they
+  # are consumed instead of calling len().
+  num_graphs = 0
+  for graph in graphs:
+    num_graphs += 1
+
+    for name, node_acc in node_accumulators.items():
+      node_acc.node_counts.append(_get_num_nodes(graph.node_sets[name]))
+
+    for name, edge_acc in edge_accumulators.items():
+      edgeset = graph.edge_sets[name]
+      edge_acc.edge_counts.append(edgeset.num_edges())
+
+      edge_schema = schema.edge_sets[name]
+      num_sources = _get_num_nodes(graph.node_sets[edge_schema.source])
+      num_targets = _get_num_nodes(graph.node_sets[edge_schema.target])
+
+      # adjacency[0] holds the source node index of every edge and
+      # adjacency[1] the target node index.
+      edge_acc.in_degrees.append(
+          _count_degrees(edgeset.adjacency[1], num_targets)
+      )
+      edge_acc.out_degrees.append(
+          _count_degrees(edgeset.adjacency[0], num_sources)
+      )
+
+  if num_graphs == 0:
+    raise ValueError("The input 'graphs' iterable was empty.")
+
+  def make_hist(values: np.ndarray):
+    # All the histograms share the same binning options.
+    return analyse_hist.make_histogram(
+        values, num_bins=num_bins, log_scale=True, is_integer=True
+    )
+
+  node_sets_stats = {
+      name: statistics_lib.NodeSetTopologyStatistics(
+          num_nodes=make_hist(np.array(acc.node_counts))
+      )
+      for name, acc in node_accumulators.items()
+  }
+
+  edge_sets_stats = {
+      name: statistics_lib.EdgeSetTopologyStatistics(
+          num_edges=make_hist(np.array(acc.edge_counts)),
+          # Degrees are pooled over the nodes of all the graphs.
+          in_degree_distribution=make_hist(np.concatenate(acc.in_degrees)),
+          out_degree_distribution=make_hist(np.concatenate(acc.out_degrees)),
+      )
+      for name, acc in edge_accumulators.items()
+  }
+
+  return statistics_lib.GraphTopologyStatistics(
+      node_sets=node_sets_stats,
+      edge_sets=edge_sets_stats,
+      num_graphs=num_graphs,
+  )
diff --git a/dgf/src/analyse/topology_statistics_test.py b/dgf/src/analyse/topology_statistics_test.py
new file mode 100644
index 0000000..ee758f4
--- /dev/null
+++ 
b/dgf/src/analyse/topology_statistics_test.py
@@ -0,0 +1,106 @@
+# Copyright 2022 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for topology_statistics."""
+
+from absl.testing import absltest
+from dgf.src.analyse import topology_statistics as topo_stats_lib
+from dgf.src.data import histogram as histogram_lib
+from dgf.src.data import statistics as statistics_lib
+from dgf.src.util import gen_test_graph
+from dgf.src.util import test_util
+
+test_util.disable_diff_truncation()
+
+
+def _expected_stats(num_graphs: int) -> statistics_lib.GraphTopologyStatistics:
+  """Builds the statistics expected for `num_graphs` copies of the test graph.
+
+  The values below are those expected for one graph produced by
+  `gen_test_graph.generate_in_memory_graph()`. Analysing several copies of that
+  graph keeps the histogram bins and multiplies every bin value by the number
+  of copies.
+
+  Args:
+    num_graphs: Number of copies of the test graph that were analysed.
+
+  Returns:
+    The expected GraphTopologyStatistics.
+  """
+  scale = float(num_graphs)
+
+  def hist(values, bins):
+    return histogram_lib.Histogram(
+        values=[value * scale for value in values], bins=bins
+    )
+
+  def node_set_stats():
+    return statistics_lib.NodeSetTopologyStatistics(
+        num_nodes=hist([1.0], [2.0, 2.0])
+    )
+
+  def edge_set_stats():
+    return statistics_lib.EdgeSetTopologyStatistics(
+        num_edges=hist([1.0], [2.0, 2.0]),
+        in_degree_distribution=hist([2.0], [1.0, 1.0]),
+        out_degree_distribution=hist([1.0, 1.0], [0.0, 1.0, 2.0]),
+    )
+
+  return statistics_lib.GraphTopologyStatistics(
+      node_sets={"n1": node_set_stats(), "n2": node_set_stats()},
+      edge_sets={"e1": edge_set_stats(), "e2": edge_set_stats()},
+      num_graphs=num_graphs,
+  )
+
+
+class TopologyStatisticsTest(absltest.TestCase):
+
+  def test_topology_statistics(self):
+    schema = gen_test_graph.generate_schema()
+    graph = gen_test_graph.generate_in_memory_graph()
+
+    stats = topo_stats_lib.topology_statistics(graph, schema, num_bins=2)
+
+    test_util.assert_are_equal(self, stats, _expected_stats(num_graphs=1))
+
+  def test_topology_statistics_from_graphs(self):
+    schema = gen_test_graph.generate_schema()
+    graph1 = gen_test_graph.generate_in_memory_graph()
+    graph2 = gen_test_graph.generate_in_memory_graph()
+
+    stats = topo_stats_lib.topology_statistics_from_graphs(
+        [graph1, graph2], schema, num_bins=2
+    )
+
+    test_util.assert_are_equal(self, stats, _expected_stats(num_graphs=2))
+
+  def test_topology_statistics_from_graphs_accepts_generator(self):
+    schema = gen_test_graph.generate_schema()
+    graphs = (gen_test_graph.generate_in_memory_graph() for _ in range(2))
+
+    stats = topo_stats_lib.topology_statistics_from_graphs(
+        graphs, schema, num_bins=2
+    )
+
+    test_util.assert_are_equal(self, stats, _expected_stats(num_graphs=2))
+
+  def test_topology_statistics_from_graphs_empty_raises(self):
+    schema = gen_test_graph.generate_schema()
+
+    with self.assertRaisesRegex(ValueError, "iterable was empty"):
+      topo_stats_lib.topology_statistics_from_graphs([], schema)
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/dgf/src/api/BUILD b/dgf/src/api/BUILD
index 2ef1ed9..9203f49 100644
--- a/dgf/src/api/BUILD
+++ b/dgf/src/api/BUILD
@@ -113,6 +113,7 @@ py_library(
         "//dgf/src/analyse:padding",
         "//dgf/src/analyse:print_schema",
         "//dgf/src/analyse:schema",
+        
"//dgf/src/analyse:topology_statistics", "//dgf/src/analyse/reports:data_model", "//dgf/src/analyse/reports:reporter", "//dgf/src/analyse/topology:global_graph_topology", diff --git a/dgf/src/api/analyse.py b/dgf/src/api/analyse.py index 9b7effe..5e71218 100644 --- a/dgf/src/api/analyse.py +++ b/dgf/src/api/analyse.py @@ -20,6 +20,8 @@ from dgf.src.analyse.padding import padding_from_graph_generator from dgf.src.analyse.in_process_feature_statistics import feature_statistics_from_graphs from dgf.src.analyse.in_process_feature_statistics import feature_statistics +from dgf.src.analyse.topology_statistics import topology_statistics +from dgf.src.analyse.topology_statistics import topology_statistics_from_graphs from dgf.src.analyse.histogram import make_histogram # TODO: Use third_party/py/dgf/src/api/print.py version instead.