diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 662165d..a463a2a 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -23,4 +23,4 @@ jobs: pip install .[test] - name: Test with pytest run: | - pytest + pytest -vv diff --git a/README.md b/README.md index 0ddf9c4..e85ee47 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This Python package provides a simple interface to interact with Fedivertex: htt Our package automatically downloads the dataset from Kaggle and loads graphs in a usable format (i.e., NetworkX). The Fediverse Graph dataset provides graphs for different decentralized social media. -These graphs represents the interactions between servers in these decentralized social media. +These graphs model the interactions between servers in these decentralized social media. The graph type corresponds to the type of interactions modelled by the graph. Finally, the dataset provides the graphs obtained on different dates, so the users can analyze the evolution of the interactions. 
diff --git a/fedivertex/cache.py b/fedivertex/cache.py new file mode 100644 index 0000000..8947430 --- /dev/null +++ b/fedivertex/cache.py @@ -0,0 +1,301 @@ +import os +import shutil +import zipfile +from datetime import datetime +from enum import Enum +from pathlib import Path +from typing import Optional + +import requests +from platformdirs import user_cache_dir +from tqdm import tqdm + +from .exceptions import CacheError, DownloadError + +_CHUNK_SIZE = 1024 * 1024 + +DEFAULT_CACHE_DIR = user_cache_dir( + appname="fedivertex-dataset", + appauthor="MarcDamie", # optional but recommended on Windows +) + +DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" +LIGHT_DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" +DATASET_URL = ( + "https://www.kaggle.com/api/v1/datasets/download/marcdamie/fediverse-graph-dataset" +) +LIGHT_DATASET_URL = "https://www.kaggle.com/api/v1/datasets/download/marcdamie/fediverse-graph-dataset-reduced" + + +class CacheStatus(Enum): + CORRUPTED = -2 + ABSENT = -1 + OUTDATED = 0 + UPTODATE = 1 + + +def read_last_update(filepath): + """Read the last update timestamp from a cache file. + + :param filepath: Path to the file containing the last update timestamp. + :type filepath: Path + :raises CacheError: if the file content is not a valid ISO datetime. + :return: Parsed datetime of the last update. + :rtype: datetime + """ + + try: + with open(filepath, "r", encoding="utf-8") as update_file: + return datetime.fromisoformat(update_file.read().strip()) + except ValueError as err: + raise CacheError("Cache corrupted (invalid update date), download necessary.") from err + + +class DatasetInfo: + """Container for dataset-related paths and metadata. + + This class centralizes all information required to interact with the dataset, + including cache locations, download URLs, and last update timestamps. 
+ + :param cache_dir: Root directory for the cache. + :type cache_dir: Path + :param light_dataset: Whether to use the reduced version of the dataset. + :type light_dataset: bool + :param cache_only: If True, only local cache is used (no network requests). + :type cache_only: bool + :raises CacheError: if cache_only=True and no cache is available. + :raises DownloadError: if metadata cannot be retrieved from the remote source. + """ + + def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): + self.cache_root = cache_dir + self.light_version = light_dataset + self.dataset_dir = cache_dir / ("reduced" if self.light_version else "full") + metadata_url = ( + LIGHT_DATASET_METADATA_URL if self.light_version else DATASET_METADATA_URL + ) + self.data_url = LIGHT_DATASET_URL if self.light_version else DATASET_URL + + if cache_only: + last_update_file = self.dataset_dir / "last_update.txt" + if last_update_file.exists(): + self.last_update = read_last_update(last_update_file) + else: + raise CacheError("No cache found... incompatible with cache_only=True") + else: + try: + resp = requests.get(metadata_url, timeout=10) + if resp.status_code != 200: + raise DownloadError( + f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" + ) + metadata = resp.json() + date = metadata["dateModified"] + except requests.RequestException as err: + raise DownloadError( + f"Could not retrieve dataset metadata ({str(err)})" + ) from err + except KeyError as err: + raise DownloadError( + "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" + ) from err + + try: + self.last_update = datetime.fromisoformat(date) + except ValueError as err: + raise DownloadError( + f"Could not retrieve dataset date (Invalid format '{date}')" + ) from err + + +def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase + """Download a file from an HTTP endpoint with progress reporting. 
+ + The file is first written to a temporary location and then atomically + renamed to avoid partial or corrupted downloads. + + :param url: URL of the file to download. + :type url: str + :param filepath: Destination path for the downloaded file. + :type filepath: Path + :raises requests.RequestException: if the HTTP request fails. + :return: None + :rtype: None + """ + + response = requests.get( + url, + stream=True, + timeout=10, + ) + response.raise_for_status() + total = int(response.headers.get("Content-Length", 0)) + + tmp_path = filepath.with_suffix(".tmp") + with ( + tmp_path.open("wb") as file, + tqdm( + desc="Downloading the dataset...", + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar, + ): + for data in response.iter_content(chunk_size=_CHUNK_SIZE): + size = file.write(data) + bar.update(size) + + tmp_path.replace(filepath) + + +def clear_default_cache(): + """Remove the entire default cache directory. + + This deletes all cached datasets stored in the default cache location. + + :return: None + :rtype: None + """ + + cache_dir = Path(DEFAULT_CACHE_DIR) + + if cache_dir.exists(): + shutil.rmtree(cache_dir) + + +def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: + """Check whether the local cache is up-to-date with the remote dataset. + + :param dataset_info: Dataset information object. + :type dataset_info: DatasetInfo + :return: Cache status indicating whether the dataset is up-to-date, + outdated, absent, or corrupted. 
+ :rtype: CacheStatus + """ + + update_file_path = dataset_info.dataset_dir / "last_update.txt" + + if update_file_path.exists(): + try: + last_local_update = read_last_update(update_file_path) + except CacheError as err: + print(str(err)) + return CacheStatus.CORRUPTED + + print("Cache found, checking for updates...") + + if last_local_update >= dataset_info.last_update: + print("Cache is up-to-date, no download necessary.") + return CacheStatus.UPTODATE + else: + print("Cache is outdated, download necessary.") + return CacheStatus.OUTDATED + else: + print("No cache found, download necessary.") + return CacheStatus.ABSENT + + +def download_dataset(dataset_info: DatasetInfo): + """Download and extract the dataset into the cache directory. + + The dataset archive is downloaded, extracted, and normalized so that + the dataset directory has a stable name independent of versioning. + + :param dataset_info: Dataset information object. + :type dataset_info: DatasetInfo + :raises requests.RequestException: if the download fails. + :raises zipfile.BadZipFile: if the archive is invalid. + :return: None + :rtype: None + """ + + archive_path = dataset_info.cache_root / "archive.zip" + + download_from_http(dataset_info.data_url, archive_path) + + print("Decompressing the dataset...") + with zipfile.ZipFile(archive_path) as zip: + zip.extractall(dataset_info.cache_root) + + # Rename the extracted folder to have a fixed name (without version) + roots = {Path(m).parts[0] for m in zip.namelist() if m.strip()} + if len(roots) == 1: + old_root = dataset_info.cache_root / next(iter(roots)) + old_root.rename(dataset_info.dataset_dir) + + archive_path.unlink() + + +def create_update_date_file(dataset_info: DatasetInfo): + """Write the dataset last update timestamp to the cache. + + :param dataset_info: Dataset information object. 
+ :type dataset_info: DatasetInfo + :return: None + :rtype: None + """ + + update_file_path = dataset_info.dataset_dir / "last_update.txt" + + with open(update_file_path, "w", encoding="utf-8") as update_file: + update_file.write(dataset_info.last_update.isoformat()) + + +def init_cache( + light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False +) -> DatasetInfo: + """Initialize dataset cache metadata without downloading data. + + This function prepares the cache directory and returns a DatasetInfo + object describing the dataset configuration. + + :param light_dataset: Whether to use the reduced dataset version. + :type light_dataset: bool + :param cache_dir: Optional custom cache directory. + :type cache_dir: Optional[Path | str] + :param cache_only: If True, only local cache is used (no network requests). + :type cache_only: bool + :return: Dataset information object. + :rtype: DatasetInfo + """ + + if cache_dir is None: + cache_dir = DEFAULT_CACHE_DIR + cache_dir = Path(cache_dir) + # Create the main cache directory if necessary + os.makedirs(cache_dir, exist_ok=True) + return DatasetInfo(cache_dir, light_dataset, cache_only) + + +def load_dataset( + light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False +) -> DatasetInfo: + """Ensure the dataset is available locally and up-to-date. + + This function checks the cache status and downloads the dataset if necessary, + unless cache_only is set to True. + + :param light_dataset: Whether to use the reduced dataset version. + :type light_dataset: bool + :param cache_dir: Optional custom cache directory. + :type cache_dir: Optional[Path | str] + :param cache_only: If True, only local cache is used (no download allowed). + :type cache_only: bool + :return: Dataset information object pointing to the local dataset. 
+ :rtype: DatasetInfo + """ + + dataset_info = init_cache(light_dataset, cache_dir, cache_only) + + if not cache_only: + cache_status = check_for_update(dataset_info) + if cache_status != CacheStatus.UPTODATE: + if dataset_info.dataset_dir.exists(): + shutil.rmtree(dataset_info.dataset_dir) + + download_dataset(dataset_info) + + create_update_date_file(dataset_info) + + return dataset_info diff --git a/fedivertex/exceptions.py b/fedivertex/exceptions.py new file mode 100644 index 0000000..7858568 --- /dev/null +++ b/fedivertex/exceptions.py @@ -0,0 +1,14 @@ +class FedivertexException(Exception): + pass + + +class DownloadError(FedivertexException): + pass + + +class CacheError(FedivertexException): + pass + + +class InteractionError(FedivertexException): + pass diff --git a/fedivertex/main.py b/fedivertex/main.py index a57f7e5..e90fa9e 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,12 +1,16 @@ -import json +import csv +import os +from pathlib import Path from types import NoneType from typing import List, Optional, Tuple -import mlcroissant as mlc import networkx as nx import networkx_temporal as tx from tqdm import tqdm +from .cache import load_dataset +from .exceptions import InteractionError + class GraphLoader: VALID_GRAPH_TYPES = { @@ -25,18 +29,11 @@ class GraphLoader: } UNDIRECTED_GRAPHS = ["federation"] - def __init__(self, light_version=True): - self.light_version = light_version - if self.light_version: - url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" - else: - url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" - try: - self.dataset = mlc.Dataset(jsonld=url) - except json.JSONDecodeError as err: - raise SystemError( - "Unexpected error from Croissant (try to empty Croissant's cache in ~/.cache/croissant)" - ) from err + def __init__(self, light_version=True, cache_dir=None, cache_only=False): + self.DATASET_INFO = load_dataset(light_version, 
cache_dir, cache_only) + + def _graph_dir(self, software: str, graph_type: str, date: str) -> Path: + return self.DATASET_INFO.dataset_dir / software / graph_type / date def _check_input(self, software: str, graph_type: str) -> NoneType: """Verify that (software,graph type) combination exists @@ -45,23 +42,27 @@ def _check_input(self, software: str, graph_type: str) -> NoneType: :type software: str :param graph_type: graph type :type graph_type: str - :raises ValueError: if the software does not exist in the dataset - :raises ValueError: if the graph type does not exist for a given software + :raises InteractionError: if the software does not exist in the dataset + :raises InteractionError: if the graph type does not exist for a given software :return: Nothing :rtype: NoneType """ if software not in self.VALID_GRAPH_TYPES.keys(): - raise ValueError( + raise InteractionError( f"Invalid software! Valid software: {list(self.VALID_GRAPH_TYPES.keys())}" ) if graph_type not in self.VALID_GRAPH_TYPES[software]: - raise ValueError( + raise InteractionError( f"{graph_type} is not a valid graph type for {software}. Valid types: {self.VALID_GRAPH_TYPES[software]}" ) - if self.light_version and software == "mastodon" and graph_type == "federation": - raise ValueError( + if ( + self.DATASET_INFO.light_version + and software == "mastodon" + and graph_type == "federation" + ): + raise InteractionError( f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n" "To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`" ) @@ -76,20 +77,20 @@ def _fetch_date_index(self, software: str, graph_type: str, index: int) -> str: :type graph_type: str :param index: :type index: int - :raises ValueError: if there is no graph available of the given type. - :raises ValueError: if the index is invalid + :raises InteractionError: if there is no graph available of the given type. 
+ :raises InteractionError: if the index is invalid :return: date :rtype: str """ dates = self.list_available_dates(software, graph_type) if len(dates) == 0: - raise ValueError(f"No graph available for {software}+{graph_type}") + raise InteractionError(f"No graph available for {software}+{graph_type}") try: return dates[index] except Exception as err: - raise ValueError("Invalid index: " + str(index)) from err + raise InteractionError("Invalid index: " + str(index)) from err def _fetch_latest_date(self, software: str, graph_type: str) -> str: """Returns the latest date available for a given graph. @@ -98,14 +99,14 @@ def _fetch_latest_date(self, software: str, graph_type: str) -> str: :type software: str :param graph_type: :type graph_type: str - :raises ValueError: if there is no graph available of the given type. + :raises InteractionError: if there is no graph available of the given type. :return: date :rtype: str """ dates = self.list_available_dates(software, graph_type) if len(dates) == 0: - raise ValueError(f"No graph available for {software}+{graph_type}") + raise InteractionError(f"No graph available for {software}+{graph_type}") return dates[-1] @@ -114,7 +115,7 @@ def list_all_software(self) -> List[str]: def list_graph_types(self, software: str) -> List[str]: if software not in self.VALID_GRAPH_TYPES.keys(): - raise ValueError( + raise InteractionError( f"Invalid software! 
Valid software: {list(self.VALID_GRAPH_TYPES.keys())}" ) @@ -131,17 +132,9 @@ def list_available_dates(self, software: str, graph_type: str) -> List[str]: :rtype: List[str] """ self._check_input(software, graph_type) + graph_path = self.DATASET_INFO.dataset_dir / software / graph_type - record_sets = list(self.dataset.metadata.record_sets) - dates = [] - for record_set in record_sets: - if "interactions.csv" not in record_set.uuid: - continue - - software_i, graph_type_i, date_i, _file = record_set.uuid.split("/") - if software_i == software and graph_type_i == graph_type: - dates.append(date_i) - + dates = list(os.listdir(graph_path)) dates.sort() return dates @@ -169,14 +162,14 @@ def get_graph( :type only_largest_component: bool, optional :param disable_tqdm: disables the TQDM progress bars, defaults to False :type disable_tqdm: bool, optional - :raises ValueError: if both a date and an index are provided. + :raises InteractionError: if both a date and an index are provided. :return: a graph in the NetworkX format :rtype: nx.Graph """ self._check_input(software, graph_type) if index is not None and date is not None: - raise ValueError( + raise InteractionError( "You must provide either the date or the index of the graph, not both." 
) @@ -189,49 +182,53 @@ def get_graph( assert date is not None - interactions_csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - interaction_records = self.dataset.records(interactions_csv_file) - - instances_csv_file = f"{software}/{graph_type}/{date}/instances.csv" - instance_records = self.dataset.records(instances_csv_file) - if graph_type in self.UNDIRECTED_GRAPHS: graph = nx.Graph() else: graph = nx.DiGraph() - for record in tqdm( - instance_records, desc="Adding the nodes", disable=disable_tqdm - ): - host = record[instances_csv_file + "/host"].decode() - graph.add_node(host) - graph.nodes[host]["domain"] = host.split("[DOT]")[-1] - for col, val in record.items(): - col_name = col.split("/")[-1] - if type(val) is bytes: - val = val.decode() - if col_name not in ["host", "Id", "Label"]: - graph.nodes[host][col_name] = val - - for record in tqdm( - interaction_records, desc="Adding the edges", disable=disable_tqdm - ): - source = record[interactions_csv_file + "/Source"].decode() - target = record[interactions_csv_file + "/Target"].decode() - weight = record[interactions_csv_file + "/Weight"] - graph.add_edge(source, target, weight=weight) - - if only_largest_component: - if graph_type in self.UNDIRECTED_GRAPHS: - largest_cc = max(nx.connected_components(graph), key=len) - else: - largest_cc = max( - nx.strongly_connected_components(graph), key=len, default=() - ) + instances_csv_file = ( + self._graph_dir(software, graph_type, date) / "instances.csv" + ) + with open(instances_csv_file, "r", encoding="utf-8") as csvfile: + record_reader = csv.DictReader(csvfile) + for record in tqdm( + record_reader, desc="Adding the nodes", disable=disable_tqdm + ): + host = record["host"] + graph.add_node(host) + graph.nodes[host]["domain"] = host.split("[DOT]")[-1] + for col, val in record.items(): + col_name = col.split("/")[-1] + + if col_name not in ["host", "Id", "Label"]: + graph.nodes[host][col_name] = val + + interactions_csv_file = ( + 
self._graph_dir(software, graph_type, date) / "interactions.csv" + ) + + with open(interactions_csv_file, "r", encoding="utf-8") as csvfile: + record_reader = csv.DictReader(csvfile) + for record in tqdm( + record_reader, desc="Adding the edges", disable=disable_tqdm + ): + source = record["Source"] + target = record["Target"] + weight = record["Weight"] + graph.add_edge(source, target, weight=weight) + + if only_largest_component: + if isinstance(graph, nx.DiGraph): + largest_cc = max( + nx.strongly_connected_components(graph), key=len, default=() + ) + else: + largest_cc = max(nx.connected_components(graph), key=len) - graph = graph.subgraph(largest_cc).copy() + graph = graph.subgraph(largest_cc).copy() - return graph + return graph def get_temporal_graph( self, @@ -254,7 +251,7 @@ def get_temporal_graph( :type date: Optional[Tuple[str, str]], optional :param disable_tqdm: disables the TQDM progress bars, defaults to False :type disable_tqdm: bool, optional - :raises ValueError: if both a date and an index are provided. + :raises InteractionError: if both a date and an index are provided. :return: a graph in the NetworkX format :rtype: tx.TemporalGraph """ @@ -275,16 +272,16 @@ def get_temporal_graph( # Fetch all graphs selected_dates = availables_dates elif index is not None and date is not None: - raise ValueError( + raise InteractionError( "You must provide either the date or the index range of the graph, not both." 
) elif index is not None: if len(index) > 2: - raise ValueError("Incorrect format for the index range") + raise InteractionError("Incorrect format for the index range") if index[0] > index[1]: - raise ValueError("Incorrect index range") + raise InteractionError("Incorrect index range") if index[0] < 0 or index[1] > len(availables_dates) - 1: - raise ValueError( + raise InteractionError( f"Indices are out of the acceptable range (0,{len(availables_dates) - 1})" ) @@ -292,20 +289,20 @@ def get_temporal_graph( else: # date is not None: assert date is not None if len(date) > 2: - raise ValueError("Incorrect format for the date range") + raise InteractionError("Incorrect format for the date range") min_date, max_date = date try: min_date = int(min_date) max_date = int(max_date) except ValueError as err: - raise ValueError("Invalid date format") from err + raise InteractionError("Invalid date format") from err if ( min_date > int(availables_dates[-1]) or int(availables_dates[0]) > max_date ): - raise ValueError( + raise InteractionError( f"Indices not covering the available dates: ({availables_dates[0]},{availables_dates[-1]})" ) diff --git a/setup.py b/setup.py index c26e97d..a1a287a 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="fedivertex", - version="1.0.0", + version="1.1.0", author="Marc DAMIE", author_email="marc.damie@inria.fr", description="Interface to download and interact with Fedivertex, the Fediverse Graph Dataset", @@ -15,12 +15,13 @@ long_description_content_type="text/markdown", packages=find_packages(), license="GPLv3", - python_requires=">=3.10", # To be compatible with mlcroissant + python_requires=">=3.10", install_requires=[ - "numpy<2.0", # To be compatible with mlcroissant - "mlcroissant", + "numpy<2.0", # Necessary for networkx-temporal "networkx", "networkx-temporal", + "platformdirs", + "requests", "tqdm", ], extras_require={"test": ["pytest", "pytest-coverage"]}, diff --git 
a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..03b21bc --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,55 @@ +from fedivertex import GraphLoader + + +def test_basic_lists(): + software_list = [ + "bookwyrm", + "friendica", + "lemmy", + "mastodon", + "misskey", + "peertube", + "pleroma", + ] + + loader = GraphLoader() + assert loader.list_all_software() == software_list + + for software in software_list: + assert loader.list_graph_types(software) == loader.VALID_GRAPH_TYPES[software] + + +def test_available_dates(): + loader = GraphLoader() + peertube_dates = loader.list_available_dates("peertube", "follow") + assert set(peertube_dates).issuperset( + { + "20250203", + "20250210", + "20250217", + "20250224", + "20250303", + "20250311", + "20250317", + "20250324", + } + ) + + peertube_dates.sort() + assert loader._fetch_latest_date("peertube", "follow") == peertube_dates[-1] + + +def test_get_temporal_graph(): + loader = GraphLoader() + + temporal_graph = loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250617") + ) + assert len(temporal_graph.temporal_nodes()) == 1157 + assert len(temporal_graph.temporal_edges()) == 310695 + assert temporal_graph.number_of_snapshots() == 20 + + temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) + assert len(temporal_graph.temporal_nodes()) == 991 + assert len(temporal_graph.temporal_edges()) == 133852 + assert temporal_graph.number_of_snapshots() == 8 diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..03861c3 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,58 @@ +import os +from pathlib import Path + +from fedivertex import GraphLoader +from fedivertex.cache import DEFAULT_CACHE_DIR, clear_default_cache + + +def test_cache_removal(): + cache_path = Path(DEFAULT_CACHE_DIR) + assert cache_path.exists() + + clear_default_cache() + + assert not cache_path.exists() + + +def test_cache_status(capsys): + 
clear_default_cache() + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "No cache found, download necessary.\nDecompressing the dataset...\n" + == captured.out + ) + del _loader + + loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache found, checking for updates...\nCache is up-to-date, no download necessary.\n" + == captured.out + ) + + update_file_path = loader.DATASET_INFO.dataset_dir / "last_update.txt" + os.remove(update_file_path) + with open(update_file_path, "w") as update_file: + update_file.write("INVALID DATA") + + del loader + + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache corrupted (invalid update date), download necessary.\nDecompressing the dataset...\n" + == captured.out + ) + del _loader + + os.remove(update_file_path) + with open(update_file_path, "w") as update_file: + update_file.write("2016-04-24T12:08:29.887") + + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache found, checking for updates...\nCache is outdated, download necessary.\nDecompressing the dataset...\n" + == captured.out + ) diff --git a/tests/test_consistency.py b/tests/test_consistency.py new file mode 100644 index 0000000..fdeeb37 --- /dev/null +++ b/tests/test_consistency.py @@ -0,0 +1,128 @@ +import pytest + +from fedivertex import GraphLoader + + +def test_index_selection(): + loader = GraphLoader() + + assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" + + latest_date = loader._fetch_latest_date("peertube", "follow") + assert loader._fetch_date_index("peertube", "follow", -1) == latest_date + + +def _iter_software_graph(): + loader = GraphLoader() + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + for graph_type in graph_types: + if software == "mastodon" and graph_type == "federation": + continue + yield software, graph_type + + +@pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) +def 
test_get_graph_selection(software, graph_type): + loader = GraphLoader( + cache_only=True + ) # Avoids to fetch the metadata again and again + + date = loader._fetch_latest_date(software, graph_type) + + # Test date selection + graph1 = loader.get_graph(software, graph_type, date=date) + + if not graph_type == "federation": # Because Federation is undirected + csv_file = ( + loader.DATASET_INFO.dataset_dir + / software + / graph_type + / date + / "interactions.csv" + ) + + with open(csv_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + line_count -= 1 # Remove the header from the count + + assert graph1.number_of_edges() == line_count + + # Test index selection + graph2 = loader.get_graph(software, graph_type, index=-1) + assert graph1.number_of_edges() == graph2.number_of_edges() + + available_dates = loader.list_available_dates(software, graph_type) + date = available_dates[0] + graph3 = loader.get_graph(software, graph_type, date=date) + + graph4 = loader.get_graph(software, graph_type, index=0) + assert graph3.number_of_edges() == graph4.number_of_edges() + + +def _iter_software_graph_date(): + loader = GraphLoader(cache_only=True) + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + for graph_type in graph_types: + if graph_type == "federation": # Because we want directed graphs + continue + for date in loader.list_available_dates(software, graph_type): + yield software, graph_type, date + + +@pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) +def test_get_graph_sizes(software, graph_type, date): + loader = GraphLoader(cache_only=True) + + graph = loader.get_graph(software, graph_type, date=date) + csv_file = ( + loader.DATASET_INFO.dataset_dir + / software + / graph_type + / date + / "interactions.csv" + ) + + with open(csv_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + line_count -= 1 # Remove the header from the count + + assert graph.number_of_edges() == 
line_count # Verify that we load all the edges + # NB: an error can also occur in case of data cleaning issue in the dataset + + +def test_graph_consistency(): + loader = GraphLoader(cache_only=True) + + # Check graph consistency + peertube_graph = loader.get_graph("peertube", "follow", date="20250324") + assert peertube_graph.number_of_edges() == 19171 + assert peertube_graph.number_of_nodes() == 883 + + # Check node attributes + assert peertube_graph.nodes["aperi[DOT]tube"] == { + "domain": "tube", + "totalUsers": "39", + "totalDailyActiveUsers": "0.0", + "totalWeeklyActiveUsers": "4.0", + "totalMonthlyActiveUsers": "8.0", + "totalLocalVideos": "638", + "totalVideos": "1287", + "totalLocalPlaylists": "26.0", + "totalVideoComments": "4632", + "totalLocalVideoComments": "44", + "totalLocalVideoViews": "106216", + "serverVersion": "7.1.0", + } + + # Check largest component consistency + peertube_graph = loader.get_graph( # DIRECTED GRAPH + "peertube", "follow", date="20250324", only_largest_component=True + ) + assert peertube_graph.number_of_edges() == 7450 + assert peertube_graph.number_of_nodes() == 264 + + bookwyrm_graph = loader.get_graph( + "bookwyrm", "federation", date="20250324", only_largest_component=True + ) + assert bookwyrm_graph.number_of_nodes() == 70 + assert bookwyrm_graph.number_of_edges() == 1827 diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100644 index 0000000..5f36215 --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,83 @@ +from pathlib import Path + +import pytest + +from fedivertex import GraphLoader +from fedivertex.cache import DEFAULT_CACHE_DIR, clear_default_cache +from fedivertex.exceptions import CacheError, InteractionError + + +def test_list_error(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.list_graph_types("NON-EXISTING SOFTWARE") + + +def test_cache_only_errors(): + cache_path = Path(DEFAULT_CACHE_DIR) + assert cache_path.exists() + loader = GraphLoader(cache_only=True) 
+ # No error because the cache exists + + # Cache corruption + update_file_path = loader.DATASET_INFO.dataset_dir / "last_update.txt" + update_file_path.unlink() + with open(update_file_path, "w") as update_file: + update_file.write("INVALID DATA") + + del loader + + assert cache_path.exists() + with pytest.raises(CacheError): # Corrupted cache + _loader = GraphLoader(cache_only=True) + + clear_default_cache() + + assert not cache_path.exists() + with pytest.raises(CacheError): # Missing cache + _loader = GraphLoader(cache_only=True) + + +def test_index_selection_error(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader._fetch_date_index("peertube", "follow", 10000000000000000000000000) + + +def test_get_graph_errors(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.get_graph("NON-EXISTING", "federation") + + with pytest.raises(InteractionError): + loader.get_graph("peertube", "NON-EXISTING") + + with pytest.raises(InteractionError): + loader.get_graph("peertube", "follow", date="20250203", index=3) + + +def test_get_temporal_graph_errors(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.get_temporal_graph("NON-EXISTING", "federation") + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "NON-EXISTING") + + with pytest.raises(InteractionError): + loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) + ) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) diff --git a/tests/test_loader.py b/tests/test_loader.py deleted file mode 100644 index 7f541cf..0000000 --- a/tests/test_loader.py +++ /dev/null @@ -1,205 +0,0 @@ -import 
pytest - -from fedivertex import GraphLoader - - -def test_basic_lists(): - software_list = [ - "bookwyrm", - "friendica", - "lemmy", - "mastodon", - "misskey", - "peertube", - "pleroma", - ] - - loader = GraphLoader() - assert loader.list_all_software() == software_list - - for software in software_list: - assert loader.list_graph_types(software) == loader.VALID_GRAPH_TYPES[software] - - with pytest.raises(ValueError): - loader.list_graph_types("NON-EXISTING SOFTWARE") - - -def test_available_dates(): - loader = GraphLoader() - peertube_dates = loader.list_available_dates("peertube", "follow") - assert set(peertube_dates).issuperset( - { - "20250203", - "20250210", - "20250217", - "20250224", - "20250303", - "20250311", - "20250317", - "20250324", - } - ) - - peertube_dates.sort() - assert loader._fetch_latest_date("peertube", "follow") == peertube_dates[-1] - - -def test_index_selection(): - loader = GraphLoader() - - with pytest.raises(ValueError): - loader._fetch_date_index("peertube", "follow", 10000000000000000000000000) - - assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" - - latest_date = loader._fetch_latest_date("peertube", "follow") - assert loader._fetch_date_index("peertube", "follow", -1) == latest_date - - -def test_get_graph_errors(): - loader = GraphLoader() - - with pytest.raises(ValueError): - loader.get_graph("NON-EXISTING", "federation") - - with pytest.raises(ValueError): - loader.get_graph("peertube", "NON-EXISTING") - - with pytest.raises(ValueError): - loader.get_graph("peertube", "follow", date="20250203", index=3) - - -def _iter_software_graph(): - loader = GraphLoader() - for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": - continue - for graph_type in graph_types: - if graph_type == "federation": - continue - yield software, graph_type - - -@pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) -def test_get_graph_selection(software, graph_type): - 
loader = GraphLoader() - - date = loader._fetch_latest_date(software, graph_type) - - # Test date selection - graph1 = loader.get_graph(software, graph_type, date=date) - - if not graph_type == "federation": # Because Federation is undirected - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = loader.dataset.records(csv_file) - - assert graph1.number_of_edges() == len(list(records)) - - # Test index selection - graph2 = loader.get_graph(software, graph_type, index=-1) - assert graph1.number_of_edges() == graph2.number_of_edges() - - available_dates = loader.list_available_dates(software, graph_type) - date = available_dates[0] - graph3 = loader.get_graph(software, graph_type, date=date) - - graph4 = loader.get_graph(software, graph_type, index=0) - assert graph3.number_of_edges() == graph4.number_of_edges() - - -def _iter_software_graph_date(): - loader = GraphLoader() - for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": - continue - for graph_type in graph_types: - if graph_type == "federation": - continue - for date in loader.list_available_dates(software, graph_type): - yield software, graph_type, date - - -@pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) -def test_get_graph_sizes(software, graph_type, date): - loader = GraphLoader() - - graph = loader.get_graph(software, graph_type, date=date) - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = list(loader.dataset.records(csv_file)) - - assert graph.number_of_edges() == len(records) # Verify that we load all the edges - # NB: an error can also occur in case of data cleaning issue in the dataset - - -def test_graph_consistency(): - loader = GraphLoader() - - # Check graph consistency - peertube_graph = loader.get_graph("peertube", "follow", date="20250324") - assert peertube_graph.number_of_edges() == 19171 - assert peertube_graph.number_of_nodes() == 883 - - # Check node attributes - 
assert peertube_graph.nodes["aperi[DOT]tube"] == { - "domain": "tube", - "totalUsers": 39, - "totalDailyActiveUsers": 0.0, - "totalWeeklyActiveUsers": 4.0, - "totalMonthlyActiveUsers": 8.0, - "totalLocalVideos": 638, - "totalVideos": 1287, - "totalLocalPlaylists": 26.0, - "totalVideoComments": 4632, - "totalLocalVideoComments": 44, - "totalLocalVideoViews": 106216, - "serverVersion": "7.1.0", - } - - # Check largest component consistency - peertube_graph = loader.get_graph( # DIRECTED GRAPH - "peertube", "follow", date="20250324", only_largest_component=True - ) - assert peertube_graph.number_of_edges() == 7450 - assert peertube_graph.number_of_nodes() == 264 - - bookwyrm_graph = loader.get_graph( - "bookwyrm", "federation", date="20250324", only_largest_component=True - ) - assert bookwyrm_graph.number_of_nodes() == 70 - assert bookwyrm_graph.number_of_edges() == 1827 - - -def test_get_temporal_graph(): - loader = GraphLoader() - - with pytest.raises(ValueError): - loader.get_temporal_graph("NON-EXISTING", "federation") - - with pytest.raises(ValueError): - loader.get_temporal_graph("peertube", "NON-EXISTING") - - with pytest.raises(ValueError): - loader.get_temporal_graph( - "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) - ) - - with pytest.raises(ValueError): - loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) - - with pytest.raises(ValueError): - loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) - - with pytest.raises(ValueError): - loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) - - temporal_graph = loader.get_temporal_graph( - "peertube", "follow", date=("20250203", "20250617") - ) - assert len(temporal_graph.temporal_nodes()) == 1157 - assert len(temporal_graph.temporal_edges()) == 310695 - assert temporal_graph.number_of_snapshots() == 20 - - temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) - assert len(temporal_graph.temporal_nodes()) == 991 - 
assert len(temporal_graph.temporal_edges()) == 133852 - assert temporal_graph.number_of_snapshots() == 8