Skip to content

Commit

Permalink
Merge pull request #626 from graphistry/dev/dev-skrub
Browse files Browse the repository at this point in the history
Dev/dev skrub
  • Loading branch information
lmeyerov authored Feb 6, 2025
2 parents 105487b + 4a5edb8 commit 320326c
Show file tree
Hide file tree
Showing 38 changed files with 1,407 additions and 869 deletions.
23 changes: 18 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ jobs:
./bin/lint.sh
- name: Type check
env:
PYTHON_VERSION: ${{ matrix.python-version }}
run: |
source pygraphistry/bin/activate
./bin/typecheck.sh
Expand Down Expand Up @@ -101,6 +103,8 @@ jobs:
./bin/lint.sh
- name: Type check
env:
PYTHON_VERSION: ${{ matrix.python-version }}
run: |
source pygraphistry/bin/activate
./bin/typecheck.sh
Expand Down Expand Up @@ -143,6 +147,8 @@ jobs:
python -m pip install -e .[test,pygraphviz]
- name: Type check
env:
PYTHON_VERSION: ${{ matrix.python-version }}
run: |
source pygraphistry/bin/activate
./bin/typecheck.sh
Expand All @@ -159,8 +165,7 @@ jobs:

strategy:
matrix:
#python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
python-version: [3.8, 3.9]
python-version: [3.9, '3.10', 3.11, 3.12]

steps:

Expand All @@ -185,6 +190,8 @@ jobs:
python -m pip install -e .[test,testai,umap-learn]
- name: Type check
env:
PYTHON_VERSION: ${{ matrix.python-version }}
run: |
source pygraphistry/bin/activate
./bin/typecheck.sh
Expand All @@ -206,8 +213,7 @@ jobs:

strategy:
matrix:
python-version: [3.8, 3.9]
#python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
python-version: [3.9, '3.10', 3.11, 3.12]
#include:
# - python-version: 3.12
# continue-on-error: true
Expand All @@ -233,14 +239,16 @@ jobs:
source pygraphistry/bin/activate
python -m pip install --upgrade pip
python -m pip install -e .[test,testai,ai]
echo "dirty-cat: `pip show dirty-cat | grep Version`"
echo "skrub: `pip show skrub | grep Version`"
echo "pandas: `pip show pandas | grep Version`"
echo "numpy: `pip show numpy | grep Version`"
echo "scikit-learn: `pip show scikit-learn | grep Version`"
echo "scipy: `pip show scipy | grep Version`"
echo "umap-learn: `pip show umap-learn | grep Version`"
- name: Type check
env:
PYTHON_VERSION: ${{ matrix.python-version }}
run: |
source pygraphistry/bin/activate
./bin/typecheck.sh
Expand Down Expand Up @@ -270,6 +278,11 @@ jobs:
source pygraphistry/bin/activate
./bin/test-embed.sh
- name: Full DGL tests (rich featurize)
run: |
source pygraphistry/bin/activate
./bin/test-dgl.sh
test-neo4j:

Expand Down
37 changes: 37 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,43 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Development]

## [0.36.0 - 2025-02-05]

### Breaking

* `from_cugraph` now returns a graph that uses the src/dst bindings of the `cugraph.Graph` object instead of those of the base `Plottable`
* `pip install graphistry[umap-learn]` and `pip install graphistry[ai]` now require Python 3.9+ (was 3.8+)
* `Plottable`'s fields `_node_dbscan` / `_edge_dbscan` are now `_dbscan_nodes` / `_dbscan_edges`

### Feat

* Switch to `skrub` for feature engineering
* More AI methods support GPU path
* Support cugraph 26.10+, numpy 2.0+
* Add more umap, dbscan fields to `Plottable`

### Infra

* `[umap-learn]` + `[ai]` unpin deps - scikit, scipy, torch (now 2), etc

### Refactor

* Move more type models to models/compute/{feature,umap,cluster}
* Convert more `print` calls to `logger` calls

### Fixes

* Remove lint/type ignores and fix root causes

### Tests

* Stop ignoring warnings in featurize and umap
* Per-Python-version CI jobs now run mypy with the matching `--python-version`
* ci umap tests: py 3.8, 3.9 => 3.9..3.12
* ci ai tests: py 3.8, 3.9 => 3.9..3.12
* ci: run DGL tests (`bin/test-dgl.sh`)
* plugin tests check for module imports

## [0.35.10 - 2025-01-24]

### Fixes:
Expand Down
13 changes: 13 additions & 0 deletions bin/test-dgl.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
set -ex

# Run the DGL utility tests.
#
# Usage (run from project root):
#   ./bin/test-dgl.sh [extra pytest args...]
#
# - Extra args are forwarded to the pytest phase
# - Non-zero exit code on fail (set -e aborts on the first failing command)
#
# Assume [umap-learn,test]
# NOTE(review): confirm these extras are sufficient for the DGL tests -- TODO verify

# Log the pytest version so CI output records which runner was used
python -m pytest --version

# Forward caller-supplied args ("$@" expands to nothing when none are given),
# so invocations without arguments behave exactly as before.
python -B -m pytest -vv \
    graphistry/tests/test_dgl_utils.py \
    "$@"
8 changes: 6 additions & 2 deletions bin/typecheck.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@ set -ex

mypy --version

# Check core
mypy --config-file mypy.ini graphistry
if [ -n "$PYTHON_VERSION" ]; then
SHORT_VERSION=$(echo "$PYTHON_VERSION" | cut -d. -f1,2)
mypy --python-version "$SHORT_VERSION" --config-file mypy.ini graphistry
else
mypy --config-file mypy.ini graphistry
fi
8 changes: 4 additions & 4 deletions docker/test-cpu-umap-ai.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
set -ex


PYTHON_VERSION=${PYTHON_VERSION:-3.8} \
TEST_FILES=${@:-"graphistry/tests/test_feature_utils.py graphistry/tests/test_umap_utils.py"}

PYTHON_VERSION=${PYTHON_VERSION:-3.10} \
PIP_DEPS=${PIP_DEPS:--e .[ai,test,testai]} \
WITH_LINT=${WITH_LINT:-1} \
WITH_TYPECHECK=${WITH_TYPECHECK:-1} \
Expand All @@ -11,6 +13,4 @@ WITH_TEST=${WITH_TEST:-1} \
SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER-average_word_embeddings_komninos} \
SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER} \
./test-cpu-local.sh \
graphistry/tests/test_feature_utils.py \
graphistry/tests/test_umap_utils.py \
$@
$TEST_FILES
8 changes: 4 additions & 4 deletions docker/test-cpu-umap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
set -ex


PYTHON_VERSION=${PYTHON_VERSION:-3.8} \
TEST_FILES=${@:-"graphistry/tests/test_feature_utils.py graphistry/tests/test_umap_utils.py"}

PYTHON_VERSION=${PYTHON_VERSION:-3.9} \
PIP_DEPS=${PIP_DEPS:--e .[umap-learn,test,testai]} \
WITH_LINT=${WITH_LINT:-1} \
WITH_TYPECHECK=${WITH_TYPECHECK:-1} \
Expand All @@ -11,6 +13,4 @@ WITH_TEST=${WITH_TEST:-1} \
SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER-average_word_embeddings_komninos} \
SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER} \
./test-cpu-local.sh \
graphistry/tests/test_feature_utils.py \
graphistry/tests/test_umap_utils.py \
$@
$TEST_FILES
1 change: 0 additions & 1 deletion docker/test-gpu-local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,4 @@ docker run \
${NETWORK} \
graphistry/test-gpu:${TEST_CPU_VERSION} \
--maxfail=1 \
--ignore=graphistry/tests/test_feature_utils.py \
$@
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,10 @@
('py:class', 'torch'),
('py:class', 'umap'),
('py:class', 'sentence_transformers'),
('py:class', 'dirty_cat'),
('py:class', 'sklearn'),
('py:class', 'scipy'),
('py:class', 'seaborn'),
('py:class', 'skrub'),
('py:class', 'annoy'),
('py:class', 'NetworkX graph'),
('py:class', 'Pandas dataframe'),
Expand Down
7 changes: 5 additions & 2 deletions graphistry/Engine.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from inspect import getmodule
import warnings
import numpy as np
import pandas as pd
from typing import Any, Optional, Union
from enum import Enum
from graphistry.utils.lazy_import import lazy_cudf_import


class Engine(Enum):
Expand All @@ -29,6 +29,8 @@ def resolve_engine(
g_or_df: Optional[Any] = None,
) -> Engine:

from graphistry.utils.lazy_import import lazy_cudf_import

if isinstance(engine, str):
engine = EngineAbstract(engine)

Expand All @@ -42,7 +44,8 @@ def resolve_engine(
if isinstance(g_or_df, Plottable):
if g_or_df._nodes is not None and g_or_df._edges is not None:
if not isinstance(g_or_df._nodes, type(g_or_df._edges)):
raise ValueError(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}')
#raise ValueError(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}')
warnings.warn(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}')
g_or_df = g_or_df._edges if g_or_df._edges is not None else g_or_df._nodes

if g_or_df is not None:
Expand Down
24 changes: 23 additions & 1 deletion graphistry/Plottable.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
from typing_extensions import Literal
import pandas as pd

from graphistry.models.ModelDict import ModelDict
from graphistry.models.compute.chain_remote import FormatType, OutputTypeAll, OutputTypeDf, OutputTypeGraph
from graphistry.models.compute.dbscan import DBSCANEngine
from graphistry.models.compute.umap import UMAPEngineConcrete
from graphistry.plugins_types.cugraph_types import CuGraphKind
from graphistry.Engine import Engine, EngineAbstract
from graphistry.utils.json import JSONVal
Expand Down Expand Up @@ -72,11 +75,13 @@ class Plottable(object):
_node_embedding : Optional[pd.DataFrame]
_node_encoder : Optional[Any]
_node_features : Optional[pd.DataFrame]
_node_features_raw: Optional[pd.DataFrame]
_node_target : Optional[pd.DataFrame]

_edge_embedding : Optional[pd.DataFrame]
_edge_encoder : Optional[Any]
_edge_features : Optional[pd.DataFrame]
_edge_features_raw: Optional[pd.DataFrame]
_edge_target : Optional[pd.DataFrame]

_weighted_adjacency: Optional[Any]
Expand All @@ -88,10 +93,27 @@ class Plottable(object):
_xy: Optional[pd.DataFrame]

_umap : Optional[UMAP]
_umap_params: Optional[Dict[str, Any]]
_umap_engine: Optional[UMAPEngineConcrete]
_umap_params: Optional[Union[ModelDict, Dict[str, Any]]]
_umap_fit_kwargs: Optional[Dict[str, Any]]
_umap_transform_kwargs: Optional[Dict[str, Any]]

# extra umap
_n_components: int
_metric: str
_n_neighbors: int
_min_dist: float
_spread: float
_local_connectivity: int
_repulsion_strength: float
_negative_sample_rate: float
_suffix: str

_dbscan_engine: Optional[DBSCANEngine]
_dbscan_params: Optional[ModelDict]
_dbscan_nodes: Optional[Any] # fit model
_dbscan_edges: Optional[Any] # fit model

_adjacency : Optional[Any]
_entity_to_index : Optional[dict]
_index_to_entity : Optional[dict]
Expand Down
12 changes: 7 additions & 5 deletions graphistry/PlotterBase.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,16 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:

# the fit umap instance
self._umap = None
self._umap_engine = None
self._umap_params : Optional[Dict[str, Any]] = None
self._umap_fit_kwargs : Optional[Dict[str, Any]] = None
self._umap_transform_kwargs : Optional[Dict[str, Any]] = None

self._dbscan_engine = None
self._dbscan_params = None
self._dbscan_nodes = None # fit model
self._dbscan_edges = None # fit model

self._adjacency = None
self._entity_to_index = None
self._index_to_entity = None
Expand All @@ -216,11 +222,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
self._use_feat: bool = False
self._triplets: Optional[List] = None
self._kg_embed_dim: int = 128

# Dbscan
self._node_dbscan = None # the fit dbscan instance
self._edge_dbscan = None


# DGL
self.DGL_graph = None # the DGL graph

Expand Down
Loading

0 comments on commit 320326c

Please sign in to comment.