From a9f4297223593f8df211599277519e206c597630 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Tue, 26 Sep 2023 08:57:51 -0500 Subject: [PATCH] Enable weights for MG similarity algorithms (#3879) This is a follow-up PR to #3828, which enabled weights for the Python SG similarity algorithms. This PR also updates the tests and docstrings and removes the experimental calls. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3879 --- .../cugraph/dask/link_prediction/jaccard.py | 10 +--- .../cugraph/dask/link_prediction/overlap.py | 10 +--- .../cugraph/dask/link_prediction/sorensen.py | 10 +--- .../tests/link_prediction/test_jaccard_mg.py | 59 ++++++------------- .../tests/link_prediction/test_overlap_mg.py | 59 ++++++------------- .../tests/link_prediction/test_sorensen_mg.py | 59 ++++++------------- 6 files changed, 60 insertions(+), 147 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 218e6206fc3..5362c7a9e1e 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -118,7 +118,9 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'.
Returns ------- @@ -144,12 +146,6 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 5540be28fd1..4bda05e3c95 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -96,7 +96,9 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -122,12 +124,6 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 24295ac330c..163b0d0dc16 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -92,7 +92,9 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. 
use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -118,12 +120,6 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index b56a6baae2b..ee739c9f236 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = [False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. 
""" - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_jaccard = cugraph.experimental.jaccard(G, input_combo["vertex_pair"]) + sg_cugraph_jaccard = cugraph.jaccard( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -122,8 +132,11 @@ def input_expected_output(input_combo): def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_jaccard = benchmark(dcg.jaccard, dg, input_expected_output["vertex_pair"]) + result_jaccard = benchmark( + dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_jaccard = ( result_jaccard.compute() @@ -151,41 +164,3 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_jaccard(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.jaccard(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.jaccard(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index ce4bf619f47..87407d7b59c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = 
[False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_overlap = cugraph.experimental.overlap(G, input_combo["vertex_pair"]) + sg_cugraph_overlap = cugraph.overlap( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -125,8 +135,11 @@ def input_expected_output(input_combo): def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_overlap = benchmark(dcg.overlap, dg, input_expected_output["vertex_pair"]) + result_overlap = benchmark( + dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_overlap = ( result_overlap.compute() @@ -154,41 +167,3 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_overlap(): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.overlap(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.overlap(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index af6b60771a0..66832d08427 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -35,6 +35,7 @@ def setup_function(): IS_DIRECTED = 
[False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -49,6 +50,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -58,7 +60,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -73,7 +77,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -85,7 +92,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_sorensen = cugraph.experimental.sorensen(G, input_combo["vertex_pair"]) + sg_cugraph_sorensen = cugraph.sorensen( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -105,6 +114,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -124,8 +134,11 @@ def input_expected_output(input_combo): def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_sorensen = benchmark(dcg.sorensen, dg, input_expected_output["vertex_pair"]) + result_sorensen = benchmark( + dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_sorensen = ( result_sorensen.compute() @@ -153,41 +166,3 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_sorensen(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.sorensen(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.sorensen(dg, use_weight=use_weight)