diff --git a/.gitignore b/.gitignore index dd2c92d..2258fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -177,3 +177,6 @@ temp/ .envs/.local/.django start-dev.sh opencode.json + +*.log +!metrics/tests/fixtures/*.log \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 1f28439..0000000 --- a/AGENTS.md +++ /dev/null @@ -1,83 +0,0 @@ -# AGENTS.md - -## Project - -Django 5.2 + Wagtail 7.3 + Celery app that ingests SciELO access logs, validates them, and exports COUNTER-5 metrics to OpenSearch with monthly indices and daily nested metrics. - -## Key commands - -All commands run inside Docker via the `local.yml` compose file unless noted. - -```bash -make build # build images -make up # start all services (django, postgres, redis, celery worker+beat, mailhog) -make django_shell # Django shell via docker compose -make django_test # run full test suite (pytest) -make django_fast # tests with --failfast -make django_migrate # apply migrations -make django_makemigrations # generate new migrations -make django_createsuperuser # create Wagtail admin user -``` - -**Run a single test file/path:** -```bash -docker compose -f local.yml run --rm django pytest path/to/test_file.py -``` - -**Without Docker** (rare): use `start-dev.sh` after adjusting the ethernet interface name. - -## Architecture - -- **Wagtail admin**: `http://localhost:8009/admin` (NOT Django admin at `/django-admin/`) -- **Django apps** (top-level dirs): `core` (Wagtail pages, users, utilities, collectors), `collection`, `log_manager`, `log_manager_config`, `metrics`, `document`, `reports`, `resources`, `source`, `tracker`, `core_settings` -- **`core/`** contains utilities, shared models, Wagtail hooks, templates, and the `collectors/` subpackage. `config/` is the Django project package (settings, urls, celery_app, wsgi). -- **Celery pipeline**: `task_daily_log_ingestion_pipeline` (auto-scheduled) chains Search -> Validate -> Parse -> Export using Celery chords. 
Individual steps can be triggered manually via Wagtail admin. -- **Task names** use translatable strings, e.g. `_[Log Pipeline] 1. Search Logs (Manual)` — do not rename these casually, it breaks the schedule. - -## Settings - -- `DJANGO_SETTINGS_MODULE` defaults to `config.settings.local` -- Tests use `config.settings.test` (set via `pytest.ini` `--ds=config.settings.test`) -- Env files live in `.envs/.local/` (local) and `.envs/.production/` (production) -- **`config/settings/test.py`** is minimal — it extends `base.py` and does NOT load local.py. If a test needs a setting that only exists in local.py, it must be added to test.py or set in the test directly. - -## Testing - -- Framework: **pytest** (not Django's `TestCase` runner), with `--reuse-db` by default -- Config: `pytest.ini` sets `--ds=config.settings.test --reuse-db` -- Both `unittest.TestCase` (Django-style) and pytest-style tests coexist; `pytest` is the runner -- CI runs: `build -> makemigrations -> migrate -> pytest` -- Shared fixtures in `core/conftest.py` (autouse `media_storage`, `user` fixture via factory-boy) - -## Linting & formatting - -- **black** (line length 120 implied by flake8 config; black defaults to 88 — pre-commit config pins it) -- **isort** (black profile via `line_length=88`) -- **flake8** (max-line-length=120 via setup.cfg) -- Pre-commit runs all three on commit. Configuration in `setup.cfg` (flake8, isort, mypy) and `.pre-commit-config.yaml`. - -## Local dev quirks - -- Two SciELO libs (`scielo_log_validator`, `scielo_usage_counter`) are installed from local repos mounted at `/app/scielo_log_validator` and `/app/scielo_usage_counter` when `USE_LOCAL_SCIELO_LIBS=1`. The local Dockerfile strips these from `base.txt` during build and installs them from the mounted volumes via the entrypoint script. 
-- Log files volume: `/mnt/pidata2/pi/scl/logs:/app/logs` (host-specific, may not exist on all machines) -- Mailhog UI at `http://localhost:8029` -- `manage.py` appends `core/` to `sys.path` so `from core.utils import ...` and `from utils import ...` both resolve. - -## OpenSearch - -- Client configured via `OPENSEARCH_URL`, `OPENSEARCH_BASIC_AUTH`, `OPENSEARCH_VERIFY_CERTS` -- Index naming: `usage_monthly_{collection}_{year}` (e.g. `usage_monthly_books_2026`) -- Upserts use Painless scripts for idempotent daily metric merging -- `OPENSEARCH_INDEX_NAME` (default `usage`) and `OPENSEARCH_API_KEY` are defined in base settings but not widely used - -## MCP tools - -- When you need to search framework/library docs (Django, Wagtail, Celery, OpenSearch, etc.), use `context7` tools. -- When you need to find code examples or patterns from open-source projects, use `gh_grep` tools. - -## Wagtail-specific notes - -- Multi-language: `pt-br` (default), `en`, `es` -- Wagtail URL prefixes disabled (`prefix_default_language=False`) -- After adding a language, run `make wagtail_sync` and `make wagtail_update_translation_field` -- `wagtail-modeladmin` is used for managing pipeline entities in admin diff --git a/Makefile b/Makefile index 978625e..acf5129 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,9 @@ -default: build +default: help COMPOSE_FILE_DEV = local.yml compose = ${COMPOSE_FILE_DEV} -export SCIELO_USAGE_BUILD_DATE=$(shell date -u +"%Y-%m-%dT%H:%M:%SZ") -export SCIELO_USAGE_VCS_REF=$(strip $(shell git rev-parse --short HEAD)) export SCIELO_USAGE_WEBAPP_VERSION=$(strip $(shell cat VERSION)) help: ## Show this help @@ -18,23 +16,12 @@ help: ## Show this help @egrep '^(.+)\:\ .*##\ (.+)' ${MAKEFILE_LIST} | sed 's/:.*##/#/' | column -t -c 1 -s "#" @echo '' @echo 'Example:' - @echo "\t Type 'make' (default target=build) is the same of type 'make build compose=local.yml'" @echo "\t Type 'make build' is the same of type 'make build compose=local.yml'" @echo "\t Type 'make up' 
is the same of type 'make up compose=local.yml'" app_version: ## Show version of webapp @echo "Version: " $(SCIELO_USAGE_WEBAPP_VERSION) -latest_commit: ## Show last commit ref - @echo "Latest commit: " $(SCIELO_USAGE_VCS_REF) - -build_date: ## Show build date - @echo "Build date: " $(SCIELO_USAGE_BUILD_DATE) - -############################################ -## atalhos docker compose desenvolvimento ## -############################################ - build: ## Build app using $(compose) @docker compose -f $(compose) build @@ -50,80 +37,54 @@ logs: ## See all app logs using $(compose) stop: ## Stop all app using $(compose) @docker compose -f $(compose) stop -restart: +restart: ## Restart app using $(compose) @docker compose -f $(compose) restart - + ps: ## See all containers using $(compose) @docker compose -f $(compose) ps -rm: ## Remove all containers using $(compose) - @docker compose -f $(compose) rm -f +django_bash: ## Open a bash terminal from django container using $(compose) + @docker compose -f $(compose) run --rm django bash django_shell: ## Open python terminal from django $(compose) @docker compose -f $(compose) run --rm django python manage.py shell -wagtail_sync: ## Wagtail sync Page fields (repeat every time you add a new language and to update the wagtailcore_page translations) $(compose) - @docker compose -f $(compose) run --rm django python manage.py sync_page_translation_fields - -wagtail_update_translation_field: ## Wagtail update translation fields, user this command first $(compose) - @docker compose -f $(compose) run --rm django python manage.py update_translation_fields - django_createsuperuser: ## Create a super user from django $(compose) @docker compose -f $(compose) run --rm django python manage.py createsuperuser -django_bash: ## Open a bash terminar from django container using $(compose) - @docker compose -f $(compose) run --rm django bash - -django_test: ## Run tests from django container using $(compose) - @docker compose -f $(compose) 
run --rm django pytest - -django_fast: ## Run tests fast from django container using $(compose) - @docker compose -f $(compose) run --rm django pytest --failfast +django_migrate: ## Run migrate from django container using $(compose) + @docker compose -f $(compose) run --rm django python manage.py migrate django_makemigrations: ## Run makemigrations from django container using $(compose) @docker compose -f $(compose) run --rm django python manage.py makemigrations -django_migrate: ## Run migrate from django container using $(compose) - @docker compose -f $(compose) run --rm django python manage.py migrate - django_makemessages: ## Run ./manage.py makemessages $(compose) @docker compose -f $(compose) run --rm django python manage.py makemessages --all django_compilemessages: ## Run ./manage.py compilemessages $(compose) @docker compose -f $(compose) run --rm django python manage.py compilemessages -django_dump_auth: ## Run manage.py dumpdata auth --indent=2 $(compose) - @docker compose -f $(compose) run --rm django python manage.py dumpdata auth --indent=2 --output=fixtures/auth.json - -django_load_auth: ## Run manage.py dumpdata auth --indent=2 $(compose) - @docker compose -f $(compose) run --rm django python manage.py loaddata --database=default fixtures/auth.json - -dump_data: ## Dump database into .sql $(compose) - @docker compose -f $(compose) exec -T postgres sh -c 'pg_dumpall -c -U "$$POSTGRES_USER"' > dump_`date +%d-%m-%Y"_"%H_%M_%S`.sql - -restore_data: ## Restore database into from latest.sql file $(compose) - @docker compose -f $(compose) exec -T postgres sh -c 'psql -U "$$POSTGRES_USER"' < backup/latest.sql +wagtail_update_translation_field: ## Wagtail update translation fields, use this command first $(compose) + @docker compose -f $(compose) run --rm django python manage.py update_translation_fields -############################################ -## Atalhos Úteis ## -############################################ +wagtail_sync: ## Wagtail sync Page fields 
(repeat every time you add a new language and to update the wagtailcore_page translations) $(compose) + @docker compose -f $(compose) run --rm django python manage.py sync_page_translation_fields -clean_container: ## Remove all containers - @docker compose -f $(compose) rm -sf +test: ## Alias for django_test using $(compose) + @docker compose -f $(compose) run --rm django pytest -clean_dangling_images: ## Remove all dangling images - @docker rmi -f $$(docker images --filter 'dangling=true' -q --no-trunc) +django_test: ## Run tests from django container using $(compose) + @docker compose -f $(compose) run --rm django pytest -clean_dangling_volumes: ## Remove all dangling volumes - @docker volume rm $$(docker volume ls -f dangling=true -q) +django_fast: ## Run tests fast from django container using $(compose) + @docker compose -f $(compose) run --rm django pytest --failfast -clean_project_images: ## Remove all images with "scielo_usage" on name - @docker rmi -f $$(docker images --filter=reference='*scielo_usage*' -q) +lint: ## Run flake8 using $(compose) + @docker compose -f $(compose) run --rm django flake8 -volume_down: ## Remove all volume - @docker compose -f $(compose) down -v +format_check: ## Run black and isort checks using $(compose) + @docker compose -f $(compose) run --rm django black --check . + @docker compose -f $(compose) run --rm django isort --check-only . -clean_migrations: ## Remove generated migration bytecode only - @echo "Cleaning migration bytecode..." - @find . -path "*/migrations/*.pyc" -delete - @echo "Migration bytecode cleaned successfully." 
+precommit: ## Run pre-commit hooks using $(compose) + @docker compose -f $(compose) run --rm django pre-commit run --all-files diff --git a/README.md b/README.md index 2433fa8..f57d8b6 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,196 @@ -# SciELO Usage Metrics Pipeline +# SciELO Usage -A modernized platform for processing and indexing SciELO usage logs into OpenSearch, adhering to COUNTER R5.1 standards. +[![CI](https://github.com/scieloorg/usage/actions/workflows/ci.yml/badge.svg)](https://github.com/scieloorg/usage/actions/workflows/ci.yml) +![Python](https://img.shields.io/badge/python-3.11-blue) +![Django](https://img.shields.io/badge/django-5.2-green) +![Wagtail](https://img.shields.io/badge/wagtail-7.3-teal) -## Quick Start (Dev Installation) +Application for processing SciELO access logs, extracting COUNTER R5.1 metrics, and exporting monthly/yearly usage documents to OpenSearch. -To build and run the application locally: +## Quick Start -1. `make build compose=local.yml` -2. `make django_migrate` -3. `make django_createsuperuser` -4. `make up` +Local development runs with Docker Compose using `local.yml`. -The application will be accessible at [http://localhost:8009/admin](http://localhost:8009/admin). +```bash +make build +make django_migrate +make django_createsuperuser +make up +``` + +Admin: http://localhost:8009/admin + +Main local services: ---- +| Service | Port | +|---|---:| +| Django/Wagtail | 8009 | +| PostgreSQL | 5439 | +| Redis | 6399 | +| Mailhog | 8029 | -## Key Commands +## Full Pipeline Setup -All commands run inside Docker via the `local.yml` compose file unless noted. 
+After the app is running, open a Django shell: ```bash -make build # build images -make up # start all services (django, postgres, redis, celery worker+beat, mailhog) -make django_shell # Django shell via docker compose -make django_test # run full test suite (pytest) -make django_fast # tests with --failfast -make django_migrate # apply migrations -make django_makemigrations # generate new migrations -make django_createsuperuser # create Wagtail admin user -make logs # follow all service logs -make ps # list compose services -make django_bash # open a bash shell in the django container -make django_compilemessages # compile translation files +make django_shell ``` -**Run a single test file/path:** +Seed the base data and resources: + +```python +from collection.tasks import task_load_collections +from log_manager_config.tasks import task_load_log_manager_collection_settings +from resources.tasks import task_load_geoip, task_load_robots + +log_config = [ + { + "acronym": "scl", + "directory_name": "SciELO Brasil", + "path": "/app/logs/scielo.br", + "quantity": 1, + "e-mail": "tecnologia@scielo.org", + "translator_class": "opac", + } +] + +task_load_collections.delay() +task_load_log_manager_collection_settings.delay(data=log_config) +task_load_robots.delay() +task_load_geoip.delay() +``` + +Load sources and documents before processing logs. 
For a first run, restrict document synchronization to a smaller date range: + +```python +from document.tasks import ( + task_load_dataset_metadata_into_documents, + task_load_documents_from_article_meta, + task_load_documents_from_opac, + task_load_preprints_into_documents, + task_sync_documents_from_scielo_books, +) +from source.tasks import ( + task_load_sources_from_article_meta, + task_load_sources_from_scielo_books, +) + +task_load_sources_from_article_meta.delay(collections=["scl"]) +task_load_sources_from_scielo_books.delay(limit=1000) + +date_range = {"from_date": "2025-01-01", "until_date": "2025-12-31"} +task_load_documents_from_article_meta.delay(**date_range) +task_load_documents_from_opac.delay(collection="scl", **date_range) +task_load_preprints_into_documents.delay(**date_range) +task_load_dataset_metadata_into_documents.delay(**date_range) +task_sync_documents_from_scielo_books.delay() +``` + +Before starting the log pipeline, confirm in the admin that each collection has an active Log Manager configuration pointing to a readable log directory mounted in the container. 
+ +For the example above, place a log file under the configured directory (run inside the django container, e.g. via `make django_bash`): + ```bash -docker compose -f local.yml run --rm django pytest path/to/test_file.py +mkdir -p /app/logs/scielo.br +cp metrics/tests/fixtures/usage.log /app/logs/scielo.br/usage-2021-05-21.log ``` -## Architecture & Data Pipeline +Run the full Search -> Validate -> Parse -> Export chain for a date range: -### Apps +```python +from log_manager.tasks import task_search_log_files -| App | Purpose | -|---|---| -| `log_manager` | Log file discovery, validation, and status tracking | -| `log_manager_config` | Collection-specific configuration (paths, emails, expected logs/day) | -| `metrics` | Daily metric jobs, OpenSearch export, COUNTER R5.1 aggregation | -| `document` | Unified metadata model for articles, books, chapters, datasets, and preprints | -| `source` | Journal, book, preprint server, and data repository metadata | -| `reports` | Weekly, monthly, and yearly log processing reports | -| `resources` | Robot user-agent patterns and GeoIP MMDB management | -| `tracker` | Discarded line tracking and error logging | -| `core` | Wagtail pages, users, shared utilities, and external API collectors | -| `collection` | SciELO collection management | +task_search_log_files.delay( + collections=["scl"], + from_date="2021-05-21", + until_date="2021-05-21", + trigger_validation=True, +) +``` -### Core Collectors (`core/collectors/`) +Monitor execution with: -| Collector | Source | -|---|---| -| `articlemeta.py` | ArticleMeta REST/Thrift API | -| `opac.py` | SciELO OPAC endpoint | -| `preprints.py` | SciELO Preprints OAI-PMH | -| `dataverse.py` | SciELO Data (Dataverse) | -| `scielo_books.py` | SciELO Books CouchDB changes feed | +```bash +make logs +``` -### Log Ingestion Pipeline +## Commands -The ingestion is fully automated via the **`[Log Pipeline] Daily Routine (Auto)`** task. 
It follows a strictly ordered sequence using Celery Chords: +```bash +make help # list available targets +make app_version # show VERSION +make build # build local images +make build_no_cache # build local images without cache +make up # start local services +make logs # follow service logs +make stop # stop local services +make restart # restart local services +make ps # list running services +make django_bash # open bash in the django container +make django_shell # open Django shell +make django_createsuperuser # create an admin user +make django_migrate # apply migrations +make django_makemigrations # create migrations +make django_makemessages # update translation messages +make django_compilemessages # compile translation messages +make wagtail_update_translation_field # update translation fields (run this first) +make wagtail_sync # sync Page translation fields (rerun after adding a language) +make test # run pytest +make django_test # run pytest +make django_fast # run pytest --failfast +make lint # run flake8 +make format_check # run black/isort checks +make precommit # run pre-commit hooks +``` + +Use `compose=production.yml` or another Compose file when needed: + +```bash +make ps compose=production.yml +``` -- **Search**: Scans configured directories for new `.log` or `.gz` files. -- **Validate**: Performs statistical sampling to ensure log integrity and detect the usage date. -- **Parse**: Extracts metrics using `scielo_usage_counter`, performs URL translation, and aggregates data. -- **Export**: Pushes results to OpenSearch using idempotent upsert scripts. +Run one test path: -### Metadata Synchronization +```bash +docker compose -f local.yml run --rm django pytest metrics/tests/test_opensearch.py +``` -Metadata is kept in sync with SciELO sources (ArticleMeta, OPAC, Books, etc.) via the **`[Metadata] Daily Sync Routine (Auto)`** task, which runs parallel workers to ensure documents and sources are always up to date. 
+## Pipeline -## Supported Log Formats +The log pipeline is coordinated by Celery tasks: -| Format | Description | -|---|---| -| NCSA Extended | Standard Apache combined log format with optional domain prefix and IP list fields. | -| BunnyCDN | Pipe-delimited format with Unix timestamps (7 or 10 digits), country codes, and request IDs. | +1. Search configured directories for new `.log` and `.gz` files. +2. Validate log samples and detect usage date. +3. Parse requests with `scielo_usage_counter`. +4. Aggregate COUNTER R5.1 metrics. +5. Export idempotent monthly/yearly documents to OpenSearch. -## Environment Variables +Metadata synchronization keeps sources and documents updated from ArticleMeta, OPAC, SciELO Books, SciELO Preprints, and SciELO Data. -Runtime configuration is loaded from `.envs/.local/` or `.envs/.production/` through the Compose files. +## Periodic Tasks -### Core Services +Configure the default schedule manually in Wagtail/Admin through `django-celery-beat` +`PeriodicTask` records. 
Exact cron times may vary by installation, but the default +operational setup should include: -| Variable | Default | Description | +| Task | Suggested schedule | Notes | |---|---|---| -| `OPENSEARCH_URL` | `http://localhost:9200/` | OpenSearch cluster URL | -| `OPENSEARCH_INDEX_NAME` | `usage` | OpenSearch index prefix | -| `OPENSEARCH_BASIC_AUTH` | `admin:admin` | OpenSearch basic auth credentials | -| `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections | -| `COUNTER_ROBOTS_URL` | `https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json` | COUNTER robot user-agent list URL used by the resources loader | -| `MMDB_URL_TEMPLATE` | `https://download.db-ip.com/free/dbip-country-lite-{year}-{month:02d}.mmdb.gz` | DB-IP GeoIP MMDB gzip URL template; `{year}` and `{month}` are filled from the current and previous month | -| `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development | -| `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module | -| `REDIS_URL` | — | Redis connection URL for Celery | - -### Collector Endpoints - -| Variable | Default | Description | +| `[Metadata] Daily Sync Routine (Auto)` | Daily, early morning | Refreshes sources and documents before log processing. Use the `load` queue. | +| `[Log Pipeline] Daily Routine (Auto)` | Daily, after metadata sync | Runs Search -> Validate -> Parse -> Export for new logs. Use the `load` queue. | +| `[Metrics] Resume Log Exports` | Every 15-30 minutes | Retries errored or stale daily metric export jobs. | +| `[Metrics] Resume Stale Parsing Logs` | Every 30-60 minutes | Marks stale `PAR` logs for retry. | +| `[Metrics] Cleanup Daily Payloads` | Daily or weekly | Removes old exported daily payload files. | +| `[Reports] Populate All Reports` | Daily, after log processing | Refreshes weekly, monthly, and yearly log report tables. 
| + +Optional operational tasks: + +| Task | Suggested schedule | Notes | |---|---|---| -| `ARTICLEMETA_COLLECT_URL` | `http://articlemeta.scielo.org/api/v1/article/counter_dict` | ArticleMeta counter metadata endpoint | -| `ARTICLEMETA_MAX_RETRIES` | `5` | ArticleMeta retry attempts | -| `ARTICLEMETA_SLEEP_TIME` | `30` | Delay between ArticleMeta retries, in seconds | -| `OPAC_ENDPOINT` | `https://www.scielo.br/api/v1/counter_dict` | OPAC counter metadata endpoint | -| `OPAC_MAX_RETRIES` | `5` | OPAC retry attempts | -| `OPAC_SLEEP_TIME` | `30` | Delay between OPAC retries, in seconds | -| `OAI_PMH_PREPRINT_ENDPOINT` | `https://preprints.scielo.org/index.php/scielo/oai` | SciELO Preprints OAI-PMH endpoint | -| `OAI_METADATA_PREFIX` | `oai_dc` | OAI-PMH metadata prefix | -| `OAI_PMH_MAX_RETRIES` | `5` | OAI-PMH retry attempts | -| `DATAVERSE_ENDPOINT` | `https://data.scielo.org/api` | SciELO Data Dataverse API endpoint | -| `DATAVERSE_ROOT_COLLECTION` | `scielodata` | Dataverse root collection alias | -| `DATAVERSE_SLEEP_TIME` | `30` | Dataverse request timeout/retry delay, in seconds | -| `SCIELO_BOOKS_BASE_URL` | `http://localhost:5984` | SciELO Books CouchDB base URL | -| `SCIELO_BOOKS_DB_NAME` | `scielobooks_1a` | SciELO Books CouchDB database name | -| `SCIELO_BOOKS_TIMEOUT` | `60` | SciELO Books request timeout, in seconds | -| `SCIELO_BOOKS_LIMIT` | `1000` | SciELO Books changes-feed page size | - -## OpenSearch Storage Strategy - -The OpenSearch export keeps monthly usage documents with nested daily metrics, while index names depend on collection size: - -- **Large and xlarge collections**: annual indices, such as `usage_monthly_scl_2024` and `usage_yearly_scl_2024`. -- **Small collections**: stable collection indices, such as `usage_monthly_books` and `usage_yearly_books`. -- **One Document per Month**: Each document/PID has one monthly document per metric scope. 
-- **Daily Nested Metrics**: Daily granularity is preserved inside each monthly document using a `daily_metrics` object. -- **Atomic Upserts**: Data is merged using OpenSearch **Painless Scripts**, allowing multiple logs for the same day/month to be processed without data duplication or loss. - -## Management & Monitoring - -All pipelines can be monitored through the **Wagtail Admin**: - -- **Log Manager**: Monitor the status of individual log files (`QUEUED`, `PARSING`, `PROCESSED`). -- **Daily Metric Jobs**: Track the history of daily processing and OpenSearch export attempts. -- **Log Config**: Manage collection-specific settings, log paths, and notification emails. - -Internally, log file statuses are stored as short codes such as `QUE`, `PAR`, and `PRO`, with labels displayed in the admin. - -### Useful Commands - -- `make django_shell`: Access the Django interactive shell. -- `make django_bash`: Open a bash shell in the Django container. -- `make logs`: Follow Docker Compose logs. -- `make ps`: Show running services. -- `docker compose -f local.yml run --rm django pytest path/to/test_file.py`: Run a single test file or path. -- `docker logs -f scielo_usage_local_celeryworker`: Monitor real-time task execution. - -## Dependencies - -- [scielo_log_validator](https://github.com/scieloorg/scielo_log_validator) — log file validation -- [scielo_usage_counter](https://github.com/scieloorg/scielo_usage_counter) — COUNTER R5.1 metrics extraction -- [device_detector](https://github.com/thinkwelltwd/device_detector) — client name/version detection -- [opensearch-py](https://github.com/opensearch-project/opensearch-py) — OpenSearch client +| `[Reports] Generate Log Report Summary (Manual)` | Manual or scheduled as needed | Sends summary emails using configured collection contacts. | +| `[Resources] Load Robots Data` | Weekly | Refreshes robots list used during parsing. | +| `[Resources] Load Geolocation Data` | Monthly | Refreshes GeoIP data used during parsing. 
| + +## Version + +Project release version is stored in `VERSION`. diff --git a/VERSION b/VERSION index 50ffc5a..7ec1d6d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.3 +2.1.0 diff --git a/collection/admin.py b/collection/admin.py index 8c38f3f..846f6b4 100644 --- a/collection/admin.py +++ b/collection/admin.py @@ -1,3 +1 @@ -from django.contrib import admin - # Register your models here. diff --git a/collection/exceptions.py b/collection/exceptions.py index 62ce062..e25d988 100644 --- a/collection/exceptions.py +++ b/collection/exceptions.py @@ -1,2 +1,2 @@ class MainCollectionNotFoundError(Exception): - ... \ No newline at end of file + ... diff --git a/collection/models.py b/collection/models.py index 87da123..b26dd16 100644 --- a/collection/models.py +++ b/collection/models.py @@ -11,7 +11,7 @@ from core.models import CommonControlField, Language, TextWithLang from core.utils.request_utils import fetch_data -from . import choices +from collection import choices class CollectionName(TextWithLang): @@ -97,7 +97,7 @@ def autocomplete_label(self): ] class Meta: - ordering = ['main_name'] + ordering = ["main_name"] verbose_name = _("Collection") verbose_name_plural = _("Collections") indexes = [ @@ -237,7 +237,7 @@ def name(self): @classmethod def acron2_list(self): return [col.acron2 for col in Collection.objects.iterator()] - + @classmethod def acron3_list(self): return [col.acron3 for col in Collection.objects.iterator()] diff --git a/collection/tasks.py b/collection/tasks.py index 221e8bc..303ecf8 100644 --- a/collection/tasks.py +++ b/collection/tasks.py @@ -1,13 +1,13 @@ from django.contrib.auth import get_user_model -from core.utils.request_utils import _get_user from collection.models import Collection from config import celery_app +from core.utils.request_utils import _get_user User = get_user_model() -@celery_app.task(bind=True, name='[Collection] Load Collection Data') +@celery_app.task(bind=True, name="[Collection] Load Collection Data") def 
task_load_collections(self, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) Collection.load(user) diff --git a/collection/tests.py b/collection/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/collection/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/collection/views.py b/collection/views.py index 91ea44a..60f00ef 100644 --- a/collection/views.py +++ b/collection/views.py @@ -1,3 +1 @@ -from django.shortcuts import render - # Create your views here. diff --git a/collection/wagtail_hooks.py b/collection/wagtail_hooks.py index 018dab8..e556930 100644 --- a/collection/wagtail_hooks.py +++ b/collection/wagtail_hooks.py @@ -5,7 +5,8 @@ from config.menu import get_menu_order from document.wagtail_hooks import DocumentSnippetViewSet from source.wagtail_hooks import SourceSnippetViewSet -from .models import Collection + +from collection.models import Collection class CollectionSnippetViewSet(SnippetViewSet): diff --git a/config/collections.py b/config/collections.py index 9aa3efe..7249f5d 100644 --- a/config/collections.py +++ b/config/collections.py @@ -33,31 +33,247 @@ "xlarge": 0.1, } + +def get_collection_size(collection_acronym): + return COLLECTION_ACRON3_SIZE_MAP.get(collection_acronym, "small") + + +def get_collection_parse_queue(collection_acronym): + return f"parse_{get_collection_size(collection_acronym)}" + LOG_MANAGER_SEED_DATA = [ - {"acronym": "arg", "directory_name": "Site clássico", "path": "/app/logs/scielo.ar", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "bol", "directory_name": "Site clássico", "path": "/app/logs/scielo.bo", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "chl", "directory_name": "Site clássico", "path": "/app/logs/scielo.cl", "quantity": 1, "start_date": 
"2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "col", "directory_name": "Site clássico", "path": "/app/logs/scielo.co", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "cri", "directory_name": "Site clássico", "path": "/app/logs/scielo.cr", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "cub", "directory_name": "Site clássico", "path": "/app/logs/scielo.cu", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "data", "directory_name": "Site clássico", "path": "/app/logs/dataverse", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "dataverse"}, - {"acronym": "dom", "directory_name": "Site novo", "path": "/app/logs/scielo.dom", "quantity": 1, "start_date": "2026-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"}, - {"acronym": "ecu", "directory_name": "Site clássico", "path": "/app/logs/scielo.ec", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "esp", "directory_name": "Site clássico", "path": "/app/logs/scielo.es", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "mex", "directory_name": "Site clássico", "path": "/app/logs/scielo.mx", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "per", "directory_name": "Site clássico", "path": "/app/logs/scielo.pe", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "preprints", "directory_name": "Site clássico", "path": "/app/logs/submission-node01", "quantity": 1, "start_date": "2020-01-01", 
"e-mail": "tecnologia@scielo.org", "translator_class": "preprints"}, - {"acronym": "prt", "directory_name": "Site clássico", "path": "/app/logs/scielo.pt", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "pry", "directory_name": "Site clássico", "path": "/app/logs/scielo.py", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "psi", "directory_name": "Site clássico", "path": "/app/logs/scielo.pepsic", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "rve", "directory_name": "Site clássico", "path": "/app/logs/scielo.revenf", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "rvt", "directory_name": "Site clássico", "path": "/app/logs/scielo.revtur", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "scl", "directory_name": "Site novo", "path": "/app/logs/scielo.br", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"}, - {"acronym": "spa", "directory_name": "Site novo - versão prévia", "path": "/app/logs/scielo.sp", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac_alpha"}, - {"acronym": "sss", "directory_name": "Site clássico", "path": "/app/logs/scielo.ss", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "sza", "directory_name": "Site clássico", "path": "/app/logs/scielo.za", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "ury", "directory_name": "Site clássico", "path": "/app/logs/scielo.uy", "quantity": 1, "start_date": "2020-01-01", 
"e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "ven", "directory_name": "Site clássico", "path": "/app/logs/scielo.ve", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "wid", "directory_name": "Site clássico", "path": "/app/logs/scielo.wi", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, - {"acronym": "books", "directory_name": "SciELO Books", "path": "/app/logs/books", "quantity": 1, "start_date": "2012-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "books"}, + { + "acronym": "arg", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.ar", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "bol", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.bo", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "chl", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.cl", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "col", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.co", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "cri", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.cr", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "cub", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.cu", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "data", + "directory_name": "Site 
clássico", + "path": "/app/logs/dataverse", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "dataverse", + }, + { + "acronym": "dom", + "directory_name": "Site novo", + "path": "/app/logs/scielo.dom", + "quantity": 1, + "start_date": "2026-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "opac", + }, + { + "acronym": "ecu", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.ec", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "esp", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.es", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "mex", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.mx", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "per", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.pe", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "preprints", + "directory_name": "Site clássico", + "path": "/app/logs/submission-node01", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "preprints", + }, + { + "acronym": "prt", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.pt", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "pry", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.py", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "psi", + "directory_name": "Site clássico", + "path": 
"/app/logs/scielo.pepsic", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "rve", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.revenf", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "rvt", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.revtur", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "scl", + "directory_name": "Site novo", + "path": "/app/logs/scielo.br", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "opac", + }, + { + "acronym": "spa", + "directory_name": "Site novo - versão prévia", + "path": "/app/logs/scielo.sp", + "quantity": 2, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "opac_alpha", + }, + { + "acronym": "sss", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.ss", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "sza", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.za", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "ury", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.uy", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "ven", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.ve", + "quantity": 1, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "wid", + "directory_name": "Site clássico", + "path": "/app/logs/scielo.wi", + 
"quantity": 2, + "start_date": "2020-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "classic", + }, + { + "acronym": "books", + "directory_name": "SciELO Books", + "path": "/app/logs/books", + "quantity": 1, + "start_date": "2012-01-01", + "e-mail": "tecnologia@scielo.org", + "translator_class": "books", + }, ] diff --git a/config/menu.py b/config/menu.py index 844ce0c..f1429e8 100644 --- a/config/menu.py +++ b/config/menu.py @@ -7,8 +7,9 @@ "tasks": 600, } + def get_menu_order(app_name): try: return WAGTAIL_MENU_APPS_ORDER[app_name] - except: + except KeyError: return 950 diff --git a/config/settings/base.py b/config/settings/base.py index 62aa17a..0b67885 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -319,6 +319,20 @@ CELERY_RESULT_BACKEND = CELERY_BROKER_URL # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-accept_content CELERY_ACCEPT_CONTENT = ["json"] +# Import nested task packages that are not exposed through package __init__.py files. 
+CELERY_IMPORTS = ( + "document.tasks.articlemeta", + "document.tasks.dataverse", + "document.tasks.opac", + "document.tasks.pipeline", + "document.tasks.preprints", + "document.tasks.scielo_books", + "metrics.tasks.cleanup", + "metrics.tasks.daily_metric_exports", + "metrics.tasks.index", + "metrics.tasks.log_parsing", + "metrics.tasks.resume", +) # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-task_serializer CELERY_TASK_SERIALIZER = "json" # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-result_serializer diff --git a/config/urls.py b/config/urls.py index 73ecd86..91bb5a4 100644 --- a/config/urls.py +++ b/config/urls.py @@ -9,7 +9,6 @@ from wagtail.documents import urls as wagtaildocs_urls from wagtailautocomplete.urls.admin import urlpatterns as autocomplete_admin_urls - urlpatterns = [ path("admin/autocomplete/", include(autocomplete_admin_urls)), path(settings.DJANGO_ADMIN_URL, admin.site.urls), diff --git a/core/__init__.py b/core/__init__.py index e1d8615..e69de29 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -1,7 +0,0 @@ -__version__ = "0.1.0" -__version_info__ = tuple( - [ - int(num) if num.isdigit() else num - for num in __version__.replace("-", ".", 1).split(".") - ] -) diff --git a/core/collectors/__init__.py b/core/collectors/__init__.py index 8b13789..e69de29 100644 --- a/core/collectors/__init__.py +++ b/core/collectors/__init__.py @@ -1 +0,0 @@ - diff --git a/core/collectors/articlemeta.py b/core/collectors/articlemeta.py index 7f6ace0..b544827 100644 --- a/core/collectors/articlemeta.py +++ b/core/collectors/articlemeta.py @@ -1,9 +1,9 @@ import logging +from time import sleep import requests -from django.conf import settings from articlemeta.client import RestfulClient, ThriftClient -from time import sleep +from django.conf import settings def fetch_article_counter_dict( diff --git a/core/collectors/opac.py b/core/collectors/opac.py index 94122b7..5771453 100644 --- 
a/core/collectors/opac.py +++ b/core/collectors/opac.py @@ -1,8 +1,8 @@ import logging +from time import sleep import requests from django.conf import settings -from time import sleep def fetch_counter_dict(from_date, until_date, page=1): diff --git a/core/collectors/scielo_books.py b/core/collectors/scielo_books.py index b1f2dd8..87595ba 100644 --- a/core/collectors/scielo_books.py +++ b/core/collectors/scielo_books.py @@ -1,10 +1,8 @@ import logging +from urllib.parse import urlencode import requests from django.conf import settings -from urllib.parse import urlencode - - def build_url(base_url, params=None): @@ -33,7 +31,9 @@ def fetch_document(doc_id, base_url=None, db_name=None, headers=None): raise ValueError("SCIELO_BOOKS_BASE_URL is not configured") url = f"{resolved_base_url}/{db_name}/{doc_id}" - response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False) + response = requests.get( + url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False + ) response.raise_for_status() payload = response.json() return sanitize_raw_data(payload), url @@ -62,7 +62,9 @@ def fetch_changes_page( params["include_docs"] = "true" url = build_url(f"{resolved_base_url}/{db_name}/_changes", params) - response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False) + response = requests.get( + url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False + ) response.raise_for_status() payload = response.json() return payload if isinstance(payload, dict) else {} diff --git a/core/home/models.py b/core/home/models.py index 1734b90..d2ce8dc 100644 --- a/core/home/models.py +++ b/core/home/models.py @@ -13,6 +13,7 @@ class HomePage(Page): pass + class FormField(AbstractFormField): page = ParentalKey("FormPage", on_delete=models.CASCADE, related_name="form_fields") @@ -45,7 +46,10 @@ def serve(self, request, *args, **kwargs): return JsonResponse( { "alert": "error", - "message": "Erro 
ao tentar enviar a formulário! Verifique os campos obrigatórios. Errors: %s" + "message": ( + "Erro ao tentar enviar a formulário! " + "Verifique os campos obrigatórios. Errors: %s" + ) % form.errors, } ) diff --git a/core/models.py b/core/models.py index 2a4ecbf..346b774 100644 --- a/core/models.py +++ b/core/models.py @@ -1,8 +1,8 @@ import os from django.contrib.auth import get_user_model -from django.db import models, IntegrityError -from django.db.models import Case, When, Value, IntegerField +from django.db import IntegrityError, models +from django.db.models import Case, IntegerField, Value, When from django.utils.translation import gettext as _ from wagtail.admin.panels import FieldPanel from wagtail.fields import RichTextField @@ -225,17 +225,20 @@ def get_object_in_preferred_language(self, language): mission = self.filter(language=language) if mission: return mission - - language_order = ['pt', 'es', 'en'] + + language_order = ["pt", "es", "en"] langs = self.all().values_list("language", flat=True) languages = Language.objects.filter(id__in=langs) - + # Define a ordem baseado na lista language_order - order = [When(code2=lang, then=Value(i)) for i, lang in enumerate(language_order)] + order = [ + When(code2=lang, then=Value(i)) for i, lang in enumerate(language_order) + ] ordered_languages = languages.annotate( - language_order=Case(*order, default=Value(len(language_order)), output_field=IntegerField()) - ).order_by('language_order') - + language_order=Case( + *order, default=Value(len(language_order)), output_field=IntegerField() + ) + ).order_by("language_order") for lang in ordered_languages: mission = self.filter(language=lang) @@ -258,7 +261,7 @@ class RichTextWithLanguage(models.Model): AutocompletePanel("language"), FieldPanel("rich_text"), ] - + objects = LanguageFallbackManager() class Meta: @@ -298,7 +301,7 @@ def autocomplete_label(self): ] class Meta: - unique_together = [("license_type", )] + unique_together = [("license_type",)] 
verbose_name = _("License") verbose_name_plural = _("Licenses") indexes = [ @@ -327,9 +330,7 @@ def get( ): if not license_type: raise ValueError("License.get requires license_type parameters") - filters = dict( - license_type__iexact=license_type - ) + filters = dict(license_type__iexact=license_type) try: return cls.objects.get(**filters) except cls.MultipleObjectsReturned: @@ -369,7 +370,8 @@ class LicenseStatement(CommonControlField): Language, on_delete=models.SET_NULL, null=True, blank=True ) license = models.ForeignKey( - License, on_delete=models.SET_NULL, null=True, blank=True) + License, on_delete=models.SET_NULL, null=True, blank=True + ) panels = [ FieldPanel("url"), @@ -407,7 +409,8 @@ def get( raise ValueError("LicenseStatement.get requires url or license_p") try: return cls.objects.get( - url__iexact=url, license_p__iexact=license_p, language=language) + url__iexact=url, license_p__iexact=license_p, language=language + ) except cls.MultipleObjectsReturned: return cls.objects.filter( url__iexact=url, license_p__iexact=license_p, language=language @@ -448,9 +451,7 @@ def create_or_update( ): try: data = dict( - url=url, - license_p=license_p, - language=language and language.code2 + url=url, license_p=license_p, language=language and language.code2 ) try: obj = cls.get(url, license_p, language) @@ -465,7 +466,9 @@ def create_or_update( except cls.DoesNotExist: return cls.create(user, url, license_p, language, license) except Exception as e: - raise ValueError(f"Unable to create or update LicenseStatement for {data}: {type(e)} {e}") + raise ValueError( + f"Unable to create or update LicenseStatement for {data}: {type(e)} {e}" + ) @staticmethod def parse_url(url): @@ -514,7 +517,7 @@ class FileWithLang(models.Model): blank=True, on_delete=models.SET_NULL, verbose_name=_("File"), - help_text='', + help_text="", related_name="+", ) diff --git a/core/tests/tests_collectors.py b/core/tests/tests_collectors.py index 6d13a7c..f595da3 100644 --- 
a/core/tests/tests_collectors.py +++ b/core/tests/tests_collectors.py @@ -28,14 +28,20 @@ def test_extract_last_seq_accepts_both_couch_formats(self): @patch("core.collectors.scielo_books.fetch_document") @patch("core.collectors.scielo_books.fetch_changes_page") - def test_iter_change_documents_uses_docs_from_changes_payload(self, mock_fetch_changes_page, mock_fetch_document): + def test_iter_change_documents_uses_docs_from_changes_payload( + self, mock_fetch_changes_page, mock_fetch_document + ): mock_fetch_changes_page.side_effect = [ { "results": [ { "seq": 10, "id": "book1", - "doc": {"_id": "book1", "TYPE": "Monograph", "title": "Book One"}, + "doc": { + "_id": "book1", + "TYPE": "Monograph", + "title": "Book One", + }, } ], "last_seq": 10, @@ -43,7 +49,11 @@ def test_iter_change_documents_uses_docs_from_changes_payload(self, mock_fetch_c {"results": [], "last_seq": 10}, ] - results = list(scielo_books.iter_change_documents(base_url="https://books.example", db_name="scielobooks_1a")) + results = list( + scielo_books.iter_change_documents( + base_url="https://books.example", db_name="scielobooks_1a" + ) + ) self.assertEqual(len(results), 1) self.assertEqual(results[0]["payload"]["id"], "book1") diff --git a/core/tests/tests_date_utils.py b/core/tests/tests_date_utils.py index 8d4f9b6..9e12869 100644 --- a/core/tests/tests_date_utils.py +++ b/core/tests/tests_date_utils.py @@ -10,27 +10,20 @@ class DateUtilsTests(TestCase): - def test_get_date_range_with_valid_dates(self): from_date = "2023-01-01" until_date = "2023-01-31" result = get_date_range_str(from_date_str=from_date, until_date_str=until_date) - expected = ( - '2023-01-01', - '2023-01-31' - ) + expected = ("2023-01-01", "2023-01-31") self.assertEqual(result, expected) def test_get_date_range_with_invalid_from_date(self): from_date = "invalid-date" until_date = "2023-01-10" result = get_date_range_str(from_date_str=from_date, until_date_str=until_date) - - expected = ( - '2023-01-03', - '2023-01-10' - ) + 
+ expected = ("2023-01-03", "2023-01-10") self.assertEqual(result, expected) def test_get_date_range_with_invalid_until_date(self): @@ -38,10 +31,7 @@ def test_get_date_range_with_invalid_until_date(self): until_date = "invalid-date" result = get_date_range_str(from_date_str=from_date, until_date_str=until_date) - expected = ( - '2024-05-20', - '2024-05-27' - ) + expected = ("2024-05-20", "2024-05-27") self.assertEqual(result, expected) def test_get_date_range_with_days_to_go_back(self): @@ -51,7 +41,7 @@ def test_get_date_range_with_days_to_go_back(self): expected = ( (today - timedelta(days=days_to_go_back)).strftime("%Y-%m-%d"), - today.strftime("%Y-%m-%d") + today.strftime("%Y-%m-%d"), ) self.assertEqual(result, expected) @@ -59,7 +49,7 @@ def test_get_date_range_with_no_params(self): result = get_date_range_str() expected = ( (datetime.now().date() - timedelta(days=7)).strftime("%Y-%m-%d"), - datetime.now().date().strftime("%Y-%m-%d") + datetime.now().date().strftime("%Y-%m-%d"), ) self.assertEqual(result, expected) @@ -67,8 +57,8 @@ def test_get_date_range_with_only_from_date(self): from_date = "2025-02-01" result = get_date_range_str(from_date_str=from_date) expected = ( - '2025-02-01', - '2025-02-08', + "2025-02-01", + "2025-02-08", ) self.assertEqual(result, expected) @@ -76,15 +66,15 @@ def test_get_date_range_with_only_until_date(self): until_date = "2025-02-08" result = get_date_range_str(until_date_str=until_date) expected = ( - '2025-02-01', - '2025-02-08', + "2025-02-01", + "2025-02-08", ) self.assertEqual(result, expected) def test_extract_minute_second_key(self): dt = datetime(2023, 3, 15, 14, 30, 45) key = extract_minute_second_key(dt) - self.assertEqual(key, '30:45') + self.assertEqual(key, "30:45") def test_extract_minute_second_key_returns_none_for_invalid_datetime(self): self.assertIsNone(extract_minute_second_key(None)) diff --git a/core/users/admin.py b/core/users/admin.py index 48460ed..02d6a9d 100644 --- a/core/users/admin.py +++ 
b/core/users/admin.py @@ -3,7 +3,6 @@ from django.contrib.auth import get_user_model from django.utils.translation import gettext_lazy as _ - User = get_user_model() diff --git a/core/users/forms.py b/core/users/forms.py index 14faa58..6e1dd9d 100644 --- a/core/users/forms.py +++ b/core/users/forms.py @@ -4,7 +4,6 @@ from django.contrib.auth import get_user_model from django.utils.translation import gettext_lazy as _ - User = get_user_model() diff --git a/core/users/models.py b/core/users/models.py index 4d894f2..6a3e360 100644 --- a/core/users/models.py +++ b/core/users/models.py @@ -15,7 +15,7 @@ class User(AbstractUser): name = models.CharField(_("Name of User"), blank=True, max_length=255) first_name = models.CharField(max_length=150, blank=True, verbose_name="first name") last_name = models.CharField(max_length=150, blank=True, verbose_name="last name") - + def get_absolute_url(self): """Get url for user's detail view. diff --git a/core/users/tasks.py b/core/users/tasks.py index 7ee093a..39511e8 100644 --- a/core/users/tasks.py +++ b/core/users/tasks.py @@ -5,7 +5,7 @@ User = get_user_model() -@celery_app.task(bind=True, name='Get users count') +@celery_app.task(bind=True, name="Get users count") def get_users_count(self): """A pointless Celery task to demonstrate usage.""" return User.objects.count() diff --git a/core/users/tests/test_urls.py b/core/users/tests/test_urls.py index c393ced..3dcbdf2 100644 --- a/core/users/tests/test_urls.py +++ b/core/users/tests/test_urls.py @@ -1,5 +1,4 @@ import pytest - from django.urls import resolve, reverse from core.users.models import User diff --git a/core/users/views.py b/core/users/views.py index 42d187f..488c294 100644 --- a/core/users/views.py +++ b/core/users/views.py @@ -5,7 +5,6 @@ from django.utils.translation import gettext_lazy as _ from django.views.generic import DetailView, RedirectView, UpdateView - User = get_user_model() diff --git a/core/utils/csv_utils.py b/core/utils/csv_utils.py index 
23d3949..3fa2da7 100644 --- a/core/utils/csv_utils.py +++ b/core/utils/csv_utils.py @@ -13,14 +13,16 @@ def get_load_data_function(file_path): Returns: function: The corresponding function to load data from the file. """ - if file_path.lower().endswith('.csv'): + if file_path.lower().endswith(".csv"): return load_csv - - if file_path.lower().endswith('.tar.gz') or ('.tar' in file_path.lower() and file_path.lower().endswith('.gz')): + + if file_path.lower().endswith(".tar.gz") or ( + ".tar" in file_path.lower() and file_path.lower().endswith(".gz") + ): return load_tar_gz -def load_csv(file_obj, delimiter='\t', is_stream=False): +def load_csv(file_obj, delimiter="\t", is_stream=False): """ Loads and processes a CSV file, yielding each row as a dictionary. @@ -33,16 +35,16 @@ def load_csv(file_obj, delimiter='\t', is_stream=False): dict: Each row of the CSV file as a dictionary. """ if is_stream: - file_obj = io.StringIO(file_obj.decode('utf-8')) + file_obj = io.StringIO(file_obj.decode("utf-8")) with file_obj if is_stream else open(file_obj) as fin: first_line = fin.readline().strip() if not first_line: return - + header = first_line.split(delimiter) reader = csv.DictReader( - fin, + fin, fieldnames=header, delimiter=delimiter, ) @@ -50,7 +52,7 @@ def load_csv(file_obj, delimiter='\t', is_stream=False): yield row -def load_tar_gz(file_path, delimiter='\t'): +def load_tar_gz(file_path, delimiter="\t"): """ Loads and processes CSV files from within a tar.gz archive, yielding each row as a dictionary. @@ -61,12 +63,8 @@ def load_tar_gz(file_path, delimiter='\t'): Yields: dict: Each row of each CSV file within the tar.gz archive as a dictionary. 
""" - with tarfile.open(file_path, 'r:gz') as tar: + with tarfile.open(file_path, "r:gz") as tar: for member in tar.getmembers(): - if member.isfile() and member.name.lower().endswith('.csv'): + if member.isfile() and member.name.lower().endswith(".csv"): file_content = tar.extractfile(member).read() - yield from load_csv( - file_content, - delimiter=delimiter, - is_stream=True - ) + yield from load_csv(file_content, delimiter=delimiter, is_stream=True) diff --git a/core/utils/date_utils.py b/core/utils/date_utils.py index f20ffea..4f3df0e 100644 --- a/core/utils/date_utils.py +++ b/core/utils/date_utils.py @@ -1,5 +1,4 @@ import logging - from datetime import datetime, timedelta @@ -32,12 +31,17 @@ def get_date_obj(date_str: str, format: str = "%Y-%m-%d") -> datetime.date: return None -def get_date_range_str(from_date_str: str = None, until_date_str: str = None, days_to_go_back: int = None) -> tuple[str, str]: +def get_date_range_str( + from_date_str: str = None, + until_date_str: str = None, + days_to_go_back: int = None, +) -> tuple[str, str]: """ Get the date range to be used in the queries. If both from_date_str and until_date_str are provided, they will be used. - If only one is provided, it will be used as the start or end date, and the other will be calculated based on a 7-day range. + If only one is provided, it will be used as the start or end date, + and the other will be calculated based on a 7-day range. If neither is provided, the function will default to the last 7 days from today. If days_to_go_back is provided, it will override the from_date_str and until_date_str. 
@@ -52,7 +56,9 @@ def get_date_range_str(from_date_str: str = None, until_date_str: str = None, da today = datetime.now().date() if days_to_go_back: - return get_date_str(today - timedelta(days=days_to_go_back)), get_date_str(today) + return get_date_str(today - timedelta(days=days_to_go_back)), get_date_str( + today + ) from_date_obj = get_date_obj(from_date_str) until_date_obj = get_date_obj(until_date_str) @@ -65,7 +71,7 @@ def get_date_range_str(from_date_str: str = None, until_date_str: str = None, da if until_date_obj: return get_date_str(until_date_obj - timedelta(days=7)), until_date_str - + return get_date_str(today - timedelta(days=7)), get_date_str(today) @@ -73,12 +79,12 @@ def get_date_obj_from_timestamp(timestamp): return datetime.fromtimestamp(timestamp).date() -def get_date_objs_from_date_range(from_date, until_date, format='%Y-%m-%d'): +def get_date_objs_from_date_range(from_date, until_date, format="%Y-%m-%d"): visible_dates = [] if not isinstance(from_date, datetime): from_date = datetime.strptime(from_date, format).date() - + if not isinstance(until_date, datetime): until_date = datetime.strptime(until_date, format).date() @@ -131,7 +137,9 @@ def _coerce_datetime(dt): try: return datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") except ValueError: - logging.error("Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'.") + logging.error( + "Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'." 
+ ) return None logging.error("Invalid datetime value: %r.", dt) diff --git a/core/utils/metadata.py b/core/utils/metadata.py new file mode 100644 index 0000000..01e78a5 --- /dev/null +++ b/core/utils/metadata.py @@ -0,0 +1,43 @@ +def as_list(value): + if not value: + return [] + + if isinstance(value, list): + return value + + return [value] + + +def compact_dict(data): + return { + key: value for key, value in data.items() if value not in (None, "", [], {}, ()) + } + + +def get_value(data, key, default=None): + if isinstance(data, dict): + return data.get(key, default) + return getattr(data, key, default) + + +def normalize_langs(value): + if not value: + return [] + + if isinstance(value, list): + return [item for item in value if item not in (None, "")] + + if isinstance(value, dict): + return [key for key, enabled in value.items() if enabled] + + return [value] + + +def normalize_year(value, fallback_date=None): + if value not in (None, ""): + return str(value)[:4] + + if fallback_date not in (None, ""): + return str(fallback_date)[:4] + + return None diff --git a/core/utils/request_utils.py b/core/utils/request_utils.py index c4fbec6..084cd46 100644 --- a/core/utils/request_utils.py +++ b/core/utils/request_utils.py @@ -1,15 +1,13 @@ import logging import requests +from django.contrib.auth import get_user_model from tenacity import ( retry, retry_if_exception_type, stop_after_attempt, wait_exponential, ) -from urllib3.util import Retry -from django.contrib.auth import get_user_model - logger = logging.getLogger(__name__) User = get_user_model() diff --git a/core/utils/standardizer.py b/core/utils/standardizer.py index c228bf5..bcd0cb2 100644 --- a/core/utils/standardizer.py +++ b/core/utils/standardizer.py @@ -46,7 +46,7 @@ def standardize_doi(text): ] for prefix in doi_prefixes: if text.lower().startswith(prefix): - text = text[len(prefix):] + text = text[len(prefix) :] break if text.lower().startswith("10."): @@ -75,3 +75,25 @@ def language_iso(code): if 
langcodes.tag_is_valid(code): return langcodes.standardize_tag(code) return "" + + +def standardize_or_default(func, value, default=""): + try: + return func(value) + except Exception: + return default + + +def standardize_pid_generic_values(values): + if not isinstance(values, (list, tuple, set)): + return [] + + items = [] + + for value in values: + item = standardize_or_default(standardize_pid_generic, value) + + if item and item not in items: + items.append(item) + + return items diff --git a/core/wagtail_hooks.py b/core/wagtail_hooks.py index e7da1eb..0604472 100644 --- a/core/wagtail_hooks.py +++ b/core/wagtail_hooks.py @@ -4,7 +4,6 @@ from django.utils.html import format_html from wagtail import hooks - HIDDEN_MAIN_MENU_ITEMS = { "documents", "explorer", diff --git a/core_settings/admin.py b/core_settings/admin.py index 8c38f3f..846f6b4 100644 --- a/core_settings/admin.py +++ b/core_settings/admin.py @@ -1,3 +1 @@ -from django.contrib import admin - # Register your models here. diff --git a/core_settings/tests.py b/core_settings/tests.py index 7ce503c..a39b155 100644 --- a/core_settings/tests.py +++ b/core_settings/tests.py @@ -1,3 +1 @@ -from django.test import TestCase - # Create your tests here. diff --git a/core_settings/views.py b/core_settings/views.py index 91ea44a..60f00ef 100644 --- a/core_settings/views.py +++ b/core_settings/views.py @@ -1,3 +1 @@ -from django.shortcuts import render - # Create your views here. 
diff --git a/django_celery_beat/models.py b/django_celery_beat/models.py index 466c16e..583a9aa 100644 --- a/django_celery_beat/models.py +++ b/django_celery_beat/models.py @@ -73,14 +73,15 @@ def crontab_schedule_celery_timezone(): except AttributeError: return "UTC" - # evita `AttributeError: type object 'TimeZoneField' has no attribute 'default_choices'` - return "UTC" - return ( - CELERY_TIMEZONE - if CELERY_TIMEZONE - in [choice[0].zone for choice in timezone_field.TimeZoneField.default_choices] - else "UTC" - ) + if not CELERY_TIMEZONE: + return "UTC" + + try: + timezone = timezone_field.TimeZoneField().to_python(CELERY_TIMEZONE) + except ValidationError: + return "UTC" + + return str(timezone) class SolarSchedule(models.Model): diff --git a/django_celery_beat/wagtail_hooks.py b/django_celery_beat/wagtail_hooks.py index 492a642..e8c1994 100644 --- a/django_celery_beat/wagtail_hooks.py +++ b/django_celery_beat/wagtail_hooks.py @@ -7,12 +7,9 @@ from django.utils.translation import gettext_lazy as _ from kombu.utils.json import loads from wagtail import hooks -from wagtail_modeladmin.options import ( - ModelAdmin, - ModelAdminGroup, - modeladmin_register, -) +from wagtail_modeladmin.options import ModelAdmin, ModelAdminGroup, modeladmin_register +from config.menu import get_menu_order from django_celery_beat.models import ( ClockedSchedule, CrontabSchedule, @@ -23,8 +20,6 @@ ) from django_celery_beat.utils import is_database_scheduler -from config.menu import get_menu_order - from .button_helper import PeriodicTaskHelper diff --git a/document/__init__.py b/document/__init__.py index 8b13789..e69de29 100644 --- a/document/__init__.py +++ b/document/__init__.py @@ -1 +0,0 @@ - diff --git a/document/management/__init__.py b/document/management/__init__.py index 8b13789..e69de29 100644 --- a/document/management/__init__.py +++ b/document/management/__init__.py @@ -1 +0,0 @@ - diff --git a/document/management/commands/__init__.py 
b/document/management/commands/__init__.py index 8b13789..e69de29 100644 --- a/document/management/commands/__init__.py +++ b/document/management/commands/__init__.py @@ -1 +0,0 @@ - diff --git a/document/management/commands/load_articles_by_year.py b/document/management/commands/load_articles_by_year.py index a922456..4b7e078 100644 --- a/document/management/commands/load_articles_by_year.py +++ b/document/management/commands/load_articles_by_year.py @@ -1,7 +1,7 @@ from django.core.management.base import BaseCommand -from document.tasks import task_load_documents_from_article_meta -from document.tasks import task_load_documents_from_opac +from document.tasks.articlemeta import task_load_documents_from_article_meta +from document.tasks.opac import task_load_documents_from_opac class Command(BaseCommand): diff --git a/document/migrations/__init__.py b/document/migrations/__init__.py index 8b13789..e69de29 100644 --- a/document/migrations/__init__.py +++ b/document/migrations/__init__.py @@ -1 +0,0 @@ - diff --git a/document/models.py b/document/models.py index 5197692..d78968b 100644 --- a/document/models.py +++ b/document/models.py @@ -174,6 +174,52 @@ class Document(CommonControlField): def __str__(self): return f"{self.collection.acron3} - {self.document_type} - {self.document_id}" + @classmethod + def build_book_pid_generic(cls, book_id): + if book_id in (None, ""): + return None + return f"book:{book_id}" + + @classmethod + def build_chapter_pid_generic(cls, book_id, chapter_id): + if book_id in (None, "") or chapter_id in (None, ""): + return None + return f"book:{book_id}/chapter:{chapter_id}" + + @classmethod + def find_by_identifiers(cls, collection, document_type, *identifiers): + identifiers = [str(value) for value in identifiers if value not in (None, "")] + if not identifiers: + return None + + queryset = cls.objects.filter( + collection=collection, + document_type=document_type, + ) + + for field_name in ("document_id", "pid_v2", "pid_v3", 
"pid_generic"): + for identifier in identifiers: + document = queryset.filter(**{field_name: identifier}).first() + if document: + return document + + return None + + @classmethod + def book_exists_for_raw_id(cls, collection, raw_id): + return cls.objects.filter( + collection=collection, + document_type=cls.DOCUMENT_TYPE_BOOK, + extra_data__raw_id=str(raw_id), + ).exists() + + @classmethod + def delete_documents_by_raw_id(cls, collection, raw_id): + return cls.objects.filter( + collection=collection, + extra_data__raw_id=str(raw_id), + ).delete() + @classmethod def metadata(cls, collection=None): queryset = cls.objects.select_related("collection", "source").only( @@ -215,7 +261,9 @@ def metadata(cls, collection=None): "files": document.files or {}, "identifiers": document.identifiers or {}, "parent_document_id": ( - document.parent_document.document_id if document.parent_document else None + document.parent_document.document_id + if document.parent_document + else None ), "pid_generic": document.pid_generic, "pid_v2": document.pid_v2, @@ -223,7 +271,8 @@ def metadata(cls, collection=None): "processing_date": document.processing_date, "publication_date": document.publication_date, "publication_year": document.publication_year, - "scielo_issn": document.scielo_issn or (source.scielo_issn if source else None), + "scielo_issn": document.scielo_issn + or (source.scielo_issn if source else None), "source_id": source.source_id if source else None, "source_type": source.source_type if source else None, "text_langs": document.text_langs or [], diff --git a/document/services/__init__.py b/document/services/__init__.py index 8b13789..e69de29 100644 --- a/document/services/__init__.py +++ b/document/services/__init__.py @@ -1 +0,0 @@ - diff --git a/document/services/articles.py b/document/services/article.py similarity index 82% rename from document/services/articles.py rename to document/services/article.py index 09244b3..c6f6d42 100644 --- a/document/services/articles.py +++ 
b/document/services/article.py @@ -1,6 +1,6 @@ from document.models import Document -from .common import build_document_id, compact_dict, get_existing_document, normalize_langs, normalize_year +from core.utils.metadata import compact_dict, normalize_langs, normalize_year def upsert_article_document_from_articlemeta( @@ -11,11 +11,13 @@ def upsert_article_document_from_articlemeta( force_update=True, ): pid_v2 = payload.get("code") - document_id = build_document_id(pid_v2, payload.get("pid_v3"), payload.get("pid_generic")) + document_id = _first_identifier( + pid_v2, payload.get("pid_v3"), payload.get("pid_generic") + ) if not document_id: return None - document = get_existing_document( + document = Document.find_by_identifiers( collection, Document.DOCUMENT_TYPE_ARTICLE, document_id, @@ -47,8 +49,12 @@ def upsert_article_document_from_articlemeta( document.default_lang = payload.get("default_language") or document.default_lang document.text_langs = normalize_langs(payload.get("text_langs")) document.default_media_format = document.default_media_format - document.processing_date = payload.get("processing_date") or document.processing_date - document.publication_date = payload.get("publication_date") or document.publication_date + document.processing_date = ( + payload.get("processing_date") or document.processing_date + ) + document.publication_date = ( + payload.get("publication_date") or document.publication_date + ) document.publication_year = normalize_year( payload.get("publication_year"), fallback_date=document.publication_date, @@ -79,11 +85,15 @@ def upsert_article_document_from_opac( ): pid_v2 = payload.get("pid_v2") pid_v3 = payload.get("pid_v3") - document_id = build_document_id(pid_v2, pid_v3, payload.get("pid_generic")) + document_id = _first_identifier( + pid_v2, + pid_v3, + payload.get("pid_generic"), + ) if not document_id: return None - document = get_existing_document( + document = Document.find_by_identifiers( collection, 
Document.DOCUMENT_TYPE_ARTICLE, document_id, @@ -115,10 +125,14 @@ def upsert_article_document_from_opac( ) document.files = document.files or {} document.default_lang = payload.get("default_language") or document.default_lang - document.text_langs = normalize_langs(payload.get("text_langs")) or document.text_langs or [] + document.text_langs = ( + normalize_langs(payload.get("text_langs")) or document.text_langs or [] + ) document.default_media_format = document.default_media_format document.processing_date = document.processing_date - document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_date = ( + payload.get("publication_date") or document.publication_date + ) document.publication_year = normalize_year( payload.get("publication_year"), fallback_date=document.publication_date, @@ -164,3 +178,10 @@ def _merge_dicts(current, new_values): merged = dict(current or {}) merged.update(new_values or {}) return merged + + +def _first_identifier(*values): + for value in values: + if value not in (None, ""): + return str(value) + return None diff --git a/document/services/books.py b/document/services/book.py similarity index 71% rename from document/services/books.py rename to document/services/book.py index 96d92e1..3b5a86a 100644 --- a/document/services/books.py +++ b/document/services/book.py @@ -1,16 +1,6 @@ from document.models import Document - -def build_book_pid_generic(book_id): - if book_id in (None, ""): - return None - return f"book:{book_id}" - - -def build_chapter_pid_generic(book_id, chapter_id): - if book_id in (None, "") or chapter_id in (None, ""): - return None - return f"book:{book_id}/chapter:{chapter_id}" +from core.utils.metadata import compact_dict, normalize_langs, normalize_year def enrich_part_payload(payload, monograph_payload): @@ -43,7 +33,7 @@ def upsert_monograph_document( return None book_id = str(payload.get("id")) - pid_generic = build_book_pid_generic(book_id) + pid_generic = 
Document.build_book_pid_generic(book_id) document, created = Document.objects.get_or_create( collection=collection, document_type=Document.DOCUMENT_TYPE_BOOK, @@ -64,11 +54,11 @@ def upsert_monograph_document( document.identifiers = _build_monograph_identifiers(payload) document.files = {} document.default_lang = payload.get("language") or None - document.text_langs = _unique_list(payload.get("language")) + document.text_langs = normalize_langs(payload.get("language")) document.default_media_format = None document.processing_date = None document.publication_date = payload.get("publication_date") or None - document.publication_year = _normalize_year(payload.get("year")) + document.publication_year = normalize_year(payload.get("year")) document.extra_data = _build_monograph_extra_data( payload, source_url=source_url, @@ -97,7 +87,7 @@ def upsert_part_document( book_id = payload.get("monograph") chapter_id = payload.get("id") - pid_generic = build_chapter_pid_generic(book_id, chapter_id) + pid_generic = Document.build_chapter_pid_generic(book_id, chapter_id) document, created = Document.objects.get_or_create( collection=collection, document_type=Document.DOCUMENT_TYPE_CHAPTER, @@ -118,17 +108,15 @@ def upsert_part_document( document.identifiers = _build_part_identifiers(payload) document.files = {} document.default_lang = ( - payload.get("text_language") - or payload.get("monograph_language") - or None + payload.get("text_language") or payload.get("monograph_language") or None ) - document.text_langs = _unique_list( + document.text_langs = normalize_langs( payload.get("text_language") or payload.get("monograph_language") ) document.default_media_format = None document.processing_date = None document.publication_date = payload.get("monograph_publication_date") or None - document.publication_year = _normalize_year(payload.get("monograph_year")) + document.publication_year = normalize_year(payload.get("monograph_year")) document.extra_data = _build_part_extra_data( 
payload, source_url=source_url, @@ -142,37 +130,6 @@ def upsert_part_document( return document -def delete_book_document(collection, book_id): - return Document.objects.filter( - collection=collection, - document_type=Document.DOCUMENT_TYPE_BOOK, - document_id=build_book_pid_generic(book_id), - ).delete() - - -def delete_document_by_raw_id(collection, raw_id): - return Document.objects.filter( - collection=collection, - extra_data__raw_id=str(raw_id), - ).delete() - - -def has_monograph_document_for_raw_id(collection, raw_id): - return Document.objects.filter( - collection=collection, - document_type=Document.DOCUMENT_TYPE_BOOK, - extra_data__raw_id=str(raw_id), - ).exists() - - -def get_monograph_document(collection, book_id): - return Document.objects.filter( - collection=collection, - document_type=Document.DOCUMENT_TYPE_BOOK, - document_id=build_book_pid_generic(book_id), - ).first() - - def _build_monograph_identifiers(payload): identifiers = { "book_id": str(payload.get("id")) if payload.get("id") is not None else None, @@ -180,19 +137,21 @@ def _build_monograph_identifiers(payload): "eisbn": payload.get("eisbn"), "doi": payload.get("doi_number"), } - return _compact_dict(identifiers) + return compact_dict(identifiers) def _build_part_identifiers(payload): identifiers = { - "book_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None, + "book_id": str(payload.get("monograph")) + if payload.get("monograph") is not None + else None, "chapter_id": str(payload.get("id")) if payload.get("id") is not None else None, "isbn": payload.get("monograph_isbn"), "eisbn": payload.get("monograph_eisbn"), "doi": payload.get("doi_number"), "book_doi": payload.get("monograph_doi_number"), } - return _compact_dict(identifiers) + return compact_dict(identifiers) def _build_monograph_extra_data(payload, source_url=None, last_seq=None): @@ -211,7 +170,7 @@ def _build_monograph_extra_data(payload, source_url=None, last_seq=None): "translated_synopses": 
payload.get("translated_synopses"), "synopsis": payload.get("synopsis"), } - return _compact_dict(extra_data) + return compact_dict(extra_data) def _build_part_extra_data(payload, source_url=None, last_seq=None): @@ -225,7 +184,9 @@ def _build_part_extra_data(payload, source_url=None, last_seq=None): "pages": payload.get("pages"), "creators": payload.get("creators"), "translated_titles": payload.get("translated_titles"), - "monograph_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None, + "monograph_id": str(payload.get("monograph")) + if payload.get("monograph") is not None + else None, "monograph_title": payload.get("monograph_title"), "monograph_language": payload.get("monograph_language"), "monograph_publication_date": payload.get("monograph_publication_date"), @@ -233,24 +194,4 @@ def _build_part_extra_data(payload, source_url=None, last_seq=None): "monograph_publisher": payload.get("monograph_publisher"), "monograph_creators": payload.get("monograph_creators"), } - return _compact_dict(extra_data) - - -def _unique_list(value): - if not value: - return [] - return [value] - - -def _normalize_year(value): - if value in (None, ""): - return None - return str(value)[:4] - - -def _compact_dict(data): - return { - key: value - for key, value in data.items() - if value not in (None, "", [], {}, ()) - } + return compact_dict(extra_data) diff --git a/document/services/common.py b/document/services/common.py deleted file mode 100644 index 91e103d..0000000 --- a/document/services/common.py +++ /dev/null @@ -1,58 +0,0 @@ -from document.models import Document - - -def build_document_id(*values): - for value in values: - if value not in (None, ""): - return str(value) - return None - - -def get_existing_document(collection, document_type, *identifiers): - identifiers = [str(value) for value in identifiers if value not in (None, "")] - if not identifiers: - return None - - queryset = Document.objects.filter( - collection=collection, - 
document_type=document_type, - ) - - for field_name in ("document_id", "pid_v2", "pid_v3", "pid_generic"): - for identifier in identifiers: - document = queryset.filter(**{field_name: identifier}).first() - if document: - return document - - return None - - -def normalize_langs(value): - if not value: - return [] - - if isinstance(value, list): - return [item for item in value if item not in (None, "")] - - if isinstance(value, dict): - return [key for key, enabled in value.items() if enabled] - - return [value] - - -def normalize_year(value, fallback_date=None): - if value not in (None, ""): - return str(value)[:4] - - if fallback_date not in (None, ""): - return str(fallback_date)[:4] - - return None - - -def compact_dict(data): - return { - key: value - for key, value in data.items() - if value not in (None, "", [], {}, ()) - } diff --git a/document/services/datasets.py b/document/services/dataset.py similarity index 91% rename from document/services/datasets.py rename to document/services/dataset.py index 2496b20..c6f5bb5 100644 --- a/document/services/datasets.py +++ b/document/services/dataset.py @@ -1,6 +1,6 @@ from document.models import Document -from .common import compact_dict, normalize_year +from core.utils.metadata import compact_dict, normalize_year def upsert_dataset_document( @@ -51,7 +51,9 @@ def upsert_dataset_document( document.text_langs = document.text_langs or [] document.default_media_format = document.default_media_format document.processing_date = document.processing_date - document.publication_date = payload.get("dataset_published") or document.publication_date + document.publication_date = ( + payload.get("dataset_published") or document.publication_date + ) document.publication_year = normalize_year( None, fallback_date=document.publication_date, diff --git a/document/services/preprints.py b/document/services/preprint.py similarity index 89% rename from document/services/preprints.py rename to document/services/preprint.py index 
4be89f1..cfcca48 100644 --- a/document/services/preprints.py +++ b/document/services/preprint.py @@ -1,6 +1,6 @@ from document.models import Document -from .common import compact_dict, normalize_langs, normalize_year +from core.utils.metadata import compact_dict, normalize_langs, normalize_year def upsert_preprint_document( @@ -40,7 +40,9 @@ def upsert_preprint_document( document.text_langs = normalize_langs(payload.get("text_langs")) document.default_media_format = document.default_media_format document.processing_date = document.processing_date - document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_date = ( + payload.get("publication_date") or document.publication_date + ) document.publication_year = normalize_year( payload.get("publication_year"), fallback_date=document.publication_date, diff --git a/document/tasks/__init__.py b/document/tasks/__init__.py index 95a0ba5..e69de29 100644 --- a/document/tasks/__init__.py +++ b/document/tasks/__init__.py @@ -1,28 +0,0 @@ -from .articlemeta import ( - load_documents_from_article_meta, - task_load_documents_from_article_meta, -) -from .common import ( - get_latest_scielo_books_last_seq, -) -from .dataverse import ( - load_dataset_metadata_from_dataverse, - task_load_dataset_metadata_into_documents, -) -from .opac import ( - load_documents_from_opac, - task_load_documents_from_opac, -) -from .pipeline import ( - task_daily_metadata_sync_pipeline, -) -from .preprints import ( - load_preprints_from_preprints_api, - task_load_preprints_into_documents, -) -from .scielo_books import ( - load_documents_from_scielo_books, - sync_documents_from_scielo_books, - task_load_documents_from_scielo_books, - task_sync_documents_from_scielo_books, -) diff --git a/document/tasks/articlemeta.py b/document/tasks/articlemeta.py index 75b2689..6fbd0b4 100644 --- a/document/tasks/articlemeta.py +++ b/document/tasks/articlemeta.py @@ -3,15 +3,14 @@ from django.db import DataError from 
django.utils.translation import gettext as _ +from config import celery_app from core.collectors import articlemeta as articlemeta_collector from core.utils import date_utils from core.utils.request_utils import _get_user -from document.services import articles as article_service -from source.services import journals as journal_service - -from config import celery_app +from document.services import article as article_service +from source.models import Source -from .common import _get_collection +from document.tasks.common import _get_collection def load_documents_from_article_meta( @@ -60,7 +59,7 @@ def load_documents_from_article_meta( ) continue - source = journal_service.find_journal_source_by_issns( + source = Source.find_journal_by_issns( collection_obj, payload.get("code_title"), ) @@ -86,8 +85,8 @@ def load_documents_from_article_meta( "Collection: %s, Source: %s, PIDv2: %s. Error: %s", collection_obj, source.source_id, - payload.get('code'), - exc + payload.get("code"), + exc, ) continue @@ -96,7 +95,12 @@ def load_documents_from_article_meta( return True -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Article Meta)"), timelimit=-1, queue="load") +@celery_app.task( + bind=True, + name=_("[Metadata] Sync Documents (Article Meta)"), + timelimit=-1, + queue="load", +) def task_load_documents_from_article_meta( self, from_date=None, diff --git a/document/tasks/dataverse.py b/document/tasks/dataverse.py index 15618a5..43d74de 100644 --- a/document/tasks/dataverse.py +++ b/document/tasks/dataverse.py @@ -3,14 +3,13 @@ from django.db import DataError from django.utils.translation import gettext as _ +from config import celery_app from core.collectors import dataverse as dataverse_collector from core.utils import date_utils from core.utils.request_utils import _get_user -from document.services import datasets as dataset_service - -from config import celery_app +from document.services import dataset as dataset_service -from .common import 
_get_collection +from document.tasks.common import _get_collection def load_dataset_metadata_from_dataverse( @@ -52,15 +51,20 @@ def load_dataset_metadata_from_dataverse( logging.error( "Error saving Dataset Document. Collection: %s, PID: %s. Error: %s", collection_obj, - payload.get('dataset_doi'), - exc + payload.get("dataset_doi"), + exc, ) continue return True -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Dataverse)"), timelimit=-1, queue="load") +@celery_app.task( + bind=True, + name=_("[Metadata] Sync Documents (Dataverse)"), + timelimit=-1, + queue="load", +) def task_load_dataset_metadata_into_documents( self, from_date=None, diff --git a/document/tasks/opac.py b/document/tasks/opac.py index 5e1c81e..3256a73 100644 --- a/document/tasks/opac.py +++ b/document/tasks/opac.py @@ -3,15 +3,14 @@ from django.db import DataError from django.utils.translation import gettext as _ +from config import celery_app from core.collectors import opac as opac_collector from core.utils import date_utils from core.utils.request_utils import _get_user -from document.services import articles as article_service -from source.services import journals as journal_service - -from config import celery_app +from document.services import article as article_service +from source.models import Source -from .common import _get_collection +from document.tasks.common import _get_collection def load_documents_from_opac( @@ -45,7 +44,7 @@ def load_documents_from_opac( documents = response.get("documents") or {} for payload in documents.values(): - source = journal_service.find_journal_source_by_acronym( + source = Source.find_journal_by_acronym( collection_obj, payload.get("journal_acronym"), ) @@ -71,8 +70,8 @@ def load_documents_from_opac( "Collection: %s, Source: %s, PIDv2: %s. 
Error: %s", collection_obj, source.source_id, - payload.get('pid_v2'), - exc + payload.get("pid_v2"), + exc, ) continue @@ -83,7 +82,9 @@ def load_documents_from_opac( return True -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (OPAC)"), timelimit=-1, queue="load") +@celery_app.task( + bind=True, name=_("[Metadata] Sync Documents (OPAC)"), timelimit=-1, queue="load" +) def task_load_documents_from_opac( self, collection="scl", diff --git a/document/tasks/pipeline.py b/document/tasks/pipeline.py index 97bef7c..1073aa8 100644 --- a/document/tasks/pipeline.py +++ b/document/tasks/pipeline.py @@ -5,20 +5,24 @@ from config import celery_app -from .articlemeta import task_load_documents_from_article_meta -from .dataverse import task_load_dataset_metadata_into_documents -from .opac import task_load_documents_from_opac -from .preprints import task_load_preprints_into_documents -from .scielo_books import task_sync_documents_from_scielo_books +from document.tasks.articlemeta import task_load_documents_from_article_meta +from document.tasks.dataverse import task_load_dataset_metadata_into_documents +from document.tasks.opac import task_load_documents_from_opac +from document.tasks.preprints import task_load_preprints_into_documents +from document.tasks.scielo_books import task_sync_documents_from_scielo_books -@celery_app.task(bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load") +@celery_app.task( + bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load" +) def task_daily_metadata_sync_pipeline(self): logging.info("Starting Daily Metadata Sync Pipeline") - group([ - task_load_documents_from_article_meta.s(), - task_load_documents_from_opac.s(), - task_load_preprints_into_documents.s(), - task_load_dataset_metadata_into_documents.s(), - task_sync_documents_from_scielo_books.s(), - ]).apply_async() + group( + [ + task_load_documents_from_article_meta.s(), + task_load_documents_from_opac.s(), + 
task_load_preprints_into_documents.s(), + task_load_dataset_metadata_into_documents.s(), + task_sync_documents_from_scielo_books.s(), + ] + ).apply_async() diff --git a/document/tasks/preprints.py b/document/tasks/preprints.py index ee63211..1f2d2e2 100644 --- a/document/tasks/preprints.py +++ b/document/tasks/preprints.py @@ -3,14 +3,13 @@ from django.db import DataError from django.utils.translation import gettext as _ +from config import celery_app from core.collectors import preprints as preprints_collector from core.utils import date_utils from core.utils.request_utils import _get_user -from document.services import preprints as preprint_service - -from config import celery_app +from document.services import preprint as preprint_service -from .common import _get_collection +from document.tasks.common import _get_collection def load_preprints_from_preprints_api( @@ -54,15 +53,20 @@ def load_preprints_from_preprints_api( logging.error( "Error saving Preprint Document. Collection: %s, PID: %s. 
Error: %s", collection_obj, - payload.get('pid_generic'), - exc + payload.get("pid_generic"), + exc, ) continue return True -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Preprints)"), timelimit=-1, queue="load") +@celery_app.task( + bind=True, + name=_("[Metadata] Sync Documents (Preprints)"), + timelimit=-1, + queue="load", +) def task_load_preprints_into_documents( self, from_date=None, diff --git a/document/tasks/scielo_books.py b/document/tasks/scielo_books.py index ddbd462..493d026 100644 --- a/document/tasks/scielo_books.py +++ b/document/tasks/scielo_books.py @@ -3,14 +3,16 @@ from django.conf import settings from django.utils.translation import gettext as _ +from collection.models import Collection +from config import celery_app from core.collectors import scielo_books as scielo_books_collector from core.utils.request_utils import _get_user -from document.services import books as document_books_service -from source.services import books as source_books_service - -from config import celery_app +from document.models import Document +from document.services import book as document_books_service +from source.models import Source +from source.services import book as source_books_service -from .common import get_latest_scielo_books_last_seq +from document.tasks.common import get_latest_scielo_books_last_seq def load_documents_from_scielo_books( @@ -25,7 +27,7 @@ def load_documents_from_scielo_books( ): db_name = db_name or settings.SCIELO_BOOKS_DB_NAME limit = limit or settings.SCIELO_BOOKS_LIMIT - collection_obj = source_books_service.get_books_collection(collection) + collection_obj = Collection.objects.get(acron3=collection) monograph_cache = {} logging.info( @@ -47,13 +49,13 @@ def load_documents_from_scielo_books( raw_id = change.get("id") if item["deleted"]: - delete_source = document_books_service.has_monograph_document_for_raw_id( + delete_source = Document.book_exists_for_raw_id( collection_obj, raw_id, ) - 
document_books_service.delete_document_by_raw_id(collection_obj, raw_id) + Document.delete_documents_by_raw_id(collection_obj, raw_id) if delete_source: - source_books_service.delete_book_source(collection_obj, raw_id) + Source.delete_book_source_by_id(collection_obj, raw_id) continue payload = item["payload"] or {} @@ -164,7 +166,9 @@ def sync_documents_from_scielo_books( ) -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load") +@celery_app.task( + bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load" +) def task_load_documents_from_scielo_books( self, collection="books", @@ -192,7 +196,11 @@ def task_load_documents_from_scielo_books( ) -@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), queue="load") +@celery_app.task( + bind=True, + name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), + queue="load", +) def task_sync_documents_from_scielo_books( self, collection="books", @@ -218,7 +226,9 @@ def task_sync_documents_from_scielo_books( ) -def _get_monograph_payload(payload, monograph_cache, base_url=None, db_name=None, headers=None): +def _get_monograph_payload( + payload, monograph_cache, base_url=None, db_name=None, headers=None +): monograph_id = payload.get("monograph") if not monograph_id: return None diff --git a/metrics/exceptions.py b/document/tests/__init__.py similarity index 100% rename from metrics/exceptions.py rename to document/tests/__init__.py diff --git a/document/tests/test_models.py b/document/tests/test_models.py new file mode 100644 index 0000000..475e7f4 --- /dev/null +++ b/document/tests/test_models.py @@ -0,0 +1,113 @@ +from django.test import TestCase + +from collection.models import Collection +from document.models import Document +from source.models import Source + + +class DocumentIdentifierTests(TestCase): + def test_find_by_identifiers_searches_legacy_identifier_fields(self): + collection = 
Collection.objects.create(acron3="scl", acron2="sc") + document = Document.objects.create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id="doc-id", + pid_v2="pid-v2", + pid_v3="pid-v3", + pid_generic="pid-generic", + ) + + for identifier in ("doc-id", "pid-v2", "pid-v3", "pid-generic"): + self.assertEqual( + Document.find_by_identifiers( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + identifier, + ), + document, + ) + + self.assertIsNone( + Document.find_by_identifiers( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + "missing", + ) + ) + + def test_builds_book_pid_generic_values(self): + self.assertEqual(Document.build_book_pid_generic("abcd1"), "book:abcd1") + self.assertEqual( + Document.build_chapter_pid_generic("abcd1", "18"), + "book:abcd1/chapter:18", + ) + self.assertIsNone(Document.build_book_pid_generic("")) + self.assertIsNone(Document.build_chapter_pid_generic("abcd1", "")) + + def test_delete_documents_by_raw_id_deletes_collection_documents(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + other_collection = Collection.objects.create(acron3="other", acron2="ot") + Document.objects.create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:abcd1", + extra_data={"raw_id": "abcd1"}, + ) + Document.objects.create( + collection=other_collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:abcd1", + extra_data={"raw_id": "abcd1"}, + ) + + deleted_count, _ = Document.delete_documents_by_raw_id(collection, "abcd1") + + self.assertEqual(deleted_count, 1) + self.assertFalse( + Document.objects.filter(collection=collection, extra_data__raw_id="abcd1") + .exists() + ) + self.assertTrue( + Document.objects.filter( + collection=other_collection, + extra_data__raw_id="abcd1", + ).exists() + ) + + +class DocumentMetadataTests(TestCase): + def test_metadata_includes_source_context_and_legacy_identifiers(self): + collection = 
Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + title="Test Journal", + identifiers={"scielo_issn": "1234-5678"}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id="S123456782024000100001", + scielo_issn="1234-5678", + pid_v2="S123456782024000100001", + pid_v3="abc123", + title="Test Article", + identifiers={"doi": "10.1590/example"}, + files={"pt": {"path": "/pdf/test.pdf"}}, + default_lang="en", + text_langs=["en", "pt"], + publication_date="2024-01-15", + publication_year="2024", + ) + + metadata = list(Document.metadata(collection=collection)) + + self.assertEqual(len(metadata), 1) + self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE) + self.assertEqual(metadata[0]["document_id"], "S123456782024000100001") + self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(metadata[0]["source_id"], "1234-5678") + self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") diff --git a/document/tests.py b/document/tests/test_services.py similarity index 57% rename from document/tests.py rename to document/tests/test_services.py index 14d9bcd..e30c306 100644 --- a/document/tests.py +++ b/document/tests/test_services.py @@ -1,56 +1,66 @@ from django.test import TestCase -from unittest.mock import patch from collection.models import Collection -from document import tasks as document_tasks -from source.services import books as source_books_service +from document.models import Document +from document.services import article as article_service +from document.services import book as books_service +from document.services import dataset as dataset_service +from document.services import preprint as preprint_service from source.models import Source +from source.services import book 
as source_books_service -from .models import Document -from .services import articles as article_service -from .services import books as books_service -from .services import datasets as dataset_service -from .services import preprints as preprint_service - -class DocumentMetadataTests(TestCase): - def test_metadata_includes_source_context_and_legacy_identifiers(self): +class ArticleServiceTests(TestCase): + def test_articlemeta_and_opac_upsert_same_document(self): collection = Collection.objects.create(acron3="scl", acron2="sc") source = Source.objects.create( collection=collection, source_type=Source.SOURCE_TYPE_JOURNAL, source_id="1234-5678", scielo_issn="1234-5678", + acronym="testjou", title="Test Journal", identifiers={"scielo_issn": "1234-5678"}, ) - Document.objects.create( + + first = article_service.upsert_article_document_from_articlemeta( + { + "code": "S123456782024000100001", + "title": "Article Title", + "pdfs": {"en": {"url": "/pdf/en.pdf"}}, + "processing_date": "2024-02-10", + "publication_date": "2024-01-15", + "publication_year": "2024", + "default_language": "en", + "text_langs": ["en", "pt"], + "code_title": ["1234-5678"], + }, + collection=collection, + source=source, + ) + second = article_service.upsert_article_document_from_opac( + { + "pid_v2": "S123456782024000100001", + "pid_v3": "S1234-56782024000100001", + "title": "Article Title", + "journal_acronym": "testjou", + "publication_date": "2024-01-15", + "default_language": "en", + "text_langs": ["en", "pt"], + }, collection=collection, source=source, - document_type=Document.DOCUMENT_TYPE_ARTICLE, - document_id="S123456782024000100001", - scielo_issn="1234-5678", - pid_v2="S123456782024000100001", - pid_v3="abc123", - title="Test Article", - identifiers={"doi": "10.1590/example"}, - files={"pt": {"path": "/pdf/test.pdf"}}, - default_lang="en", - text_langs=["en", "pt"], - publication_date="2024-01-15", - publication_year="2024", ) - metadata = 
list(Document.metadata(collection=collection)) + self.assertEqual(first.pk, second.pk) + self.assertEqual(Document.objects.count(), 1) + second.refresh_from_db() + self.assertEqual(second.pid_v3, "S1234-56782024000100001") + self.assertEqual(second.identifiers["journal_acronym"], "testjou") - self.assertEqual(len(metadata), 1) - self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE) - self.assertEqual(metadata[0]["document_id"], "S123456782024000100001") - self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) - self.assertEqual(metadata[0]["source_id"], "1234-5678") - self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") - def test_upsert_monograph_and_part_documents_from_books_payload(self): +class BookServiceTests(TestCase): + def test_upsert_monograph_and_part_documents(self): collection = Collection.objects.create(acron3="books", acron2="bk") monograph_payload = { "TYPE": "Monograph", @@ -74,13 +84,10 @@ def test_upsert_monograph_and_part_documents_from_books_payload(self): } source = source_books_service.upsert_monograph_source( - monograph_payload, - collection=collection, + monograph_payload, collection=collection ) parent_document = books_service.upsert_monograph_document( - monograph_payload, - collection=collection, - source=source, + monograph_payload, collection=collection, source=source ) chapter = books_service.upsert_part_document( books_service.enrich_part_payload(part_payload, monograph_payload), @@ -98,53 +105,8 @@ def test_upsert_monograph_and_part_documents_from_books_payload(self): self.assertEqual(chapter.identifiers["book_id"], "abcd1") self.assertEqual(chapter.default_lang, "es") - def test_articlemeta_and_opac_upsert_same_document(self): - collection = Collection.objects.create(acron3="scl", acron2="sc") - source = Source.objects.create( - collection=collection, - source_type=Source.SOURCE_TYPE_JOURNAL, - source_id="1234-5678", - scielo_issn="1234-5678", - acronym="testjou", - title="Test 
Journal", - identifiers={"scielo_issn": "1234-5678"}, - ) - - first = article_service.upsert_article_document_from_articlemeta( - { - "code": "S123456782024000100001", - "title": "Article Title", - "pdfs": {"en": {"url": "/pdf/en.pdf"}}, - "processing_date": "2024-02-10", - "publication_date": "2024-01-15", - "publication_year": "2024", - "default_language": "en", - "text_langs": ["en", "pt"], - "code_title": ["1234-5678"], - }, - collection=collection, - source=source, - ) - second = article_service.upsert_article_document_from_opac( - { - "pid_v2": "S123456782024000100001", - "pid_v3": "S1234-56782024000100001", - "title": "Article Title", - "journal_acronym": "testjou", - "publication_date": "2024-01-15", - "default_language": "en", - "text_langs": ["en", "pt"], - }, - collection=collection, - source=source, - ) - - self.assertEqual(first.pk, second.pk) - self.assertEqual(Document.objects.count(), 1) - second.refresh_from_db() - self.assertEqual(second.pid_v3, "S1234-56782024000100001") - self.assertEqual(second.identifiers["journal_acronym"], "testjou") +class PreprintServiceTests(TestCase): def test_upsert_preprint_document_maps_metadata(self): collection = Collection.objects.create(acron3="preprints", acron2="pp") @@ -165,6 +127,8 @@ def test_upsert_preprint_document_maps_metadata(self): self.assertEqual(document.pid_generic, "preprint/123") self.assertEqual(document.default_lang, "en") + +class DatasetServiceTests(TestCase): def test_upsert_dataset_document_accumulates_files(self): collection = Collection.objects.create(acron3="data", acron2="dt") @@ -196,60 +160,3 @@ def test_upsert_dataset_document_accumulates_files(self): self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_DATASET) self.assertEqual(document.document_id, "10.1234/dataset") self.assertEqual(set(document.files.keys()), {"1", "2"}) - - -class DocumentBooksSyncTests(TestCase): - def test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self): - collection = 
Collection.objects.create(acron3="books", acron2="bk") - source = Source.objects.create( - collection=collection, - source_type=Source.SOURCE_TYPE_BOOK, - source_id="book-1", - title="Book 1", - extra_data={"last_seq": 120}, - ) - Document.objects.create( - collection=collection, - source=source, - document_type=Document.DOCUMENT_TYPE_BOOK, - document_id="book:book-1", - extra_data={"last_seq": "135"}, - ) - - self.assertEqual(document_tasks.get_latest_scielo_books_last_seq("books"), 135) - - def test_sync_documents_from_scielo_books_uses_computed_since(self): - collection = Collection.objects.create(acron3="books", acron2="bk") - source = Source.objects.create( - collection=collection, - source_type=Source.SOURCE_TYPE_BOOK, - source_id="book-1", - title="Book 1", - extra_data={"last_seq": 120}, - ) - Document.objects.create( - collection=collection, - source=source, - document_type=Document.DOCUMENT_TYPE_BOOK, - document_id="book:book-1", - extra_data={"last_seq": 135}, - ) - - with patch("document.tasks.scielo_books.load_documents_from_scielo_books", return_value=True) as mocked: - result = document_tasks.sync_documents_from_scielo_books( - collection="books", - db_name="scielobooks_1a", - limit=500, - ) - - self.assertTrue(result) - mocked.assert_called_once_with( - collection="books", - db_name="scielobooks_1a", - since=135, - limit=500, - force_update=True, - headers=None, - base_url=None, - user=None, - ) diff --git a/document/tests/test_tasks.py b/document/tests/test_tasks.py new file mode 100644 index 0000000..9f3a9a1 --- /dev/null +++ b/document/tests/test_tasks.py @@ -0,0 +1,72 @@ +from unittest.mock import patch + +from django.test import TestCase + +from collection.models import Collection +from document.models import Document +from document.tasks import common as document_tasks_common +from document.tasks import scielo_books as document_tasks_scielo_books +from source.models import Source + + +class DocumentBooksSyncTests(TestCase): + def 
test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": "135"}, + ) + + self.assertEqual( + document_tasks_common.get_latest_scielo_books_last_seq("books"), + 135, + ) + + def test_sync_documents_from_scielo_books_uses_computed_since(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": 135}, + ) + + with patch( + "document.tasks.scielo_books.load_documents_from_scielo_books", + return_value=True, + ) as mocked: + result = document_tasks_scielo_books.sync_documents_from_scielo_books( + collection="books", + db_name="scielobooks_1a", + limit=500, + ) + + self.assertTrue(result) + mocked.assert_called_once_with( + collection="books", + db_name="scielobooks_1a", + since=135, + limit=500, + force_update=True, + headers=None, + base_url=None, + user=None, + ) diff --git a/document/wagtail_hooks.py b/document/wagtail_hooks.py index de291c9..51ef5ad 100644 --- a/document/wagtail_hooks.py +++ b/document/wagtail_hooks.py @@ -1,7 +1,7 @@ from django.utils.translation import gettext_lazy as _ from wagtail.snippets.views.snippets import SnippetViewSet -from .models import Document +from document.models import Document class DocumentSnippetViewSet(SnippetViewSet): diff --git a/log_manager/admin.py 
b/log_manager/admin.py index 8c38f3f..846f6b4 100644 --- a/log_manager/admin.py +++ b/log_manager/admin.py @@ -1,3 +1 @@ -from django.contrib import admin - # Register your models here. diff --git a/log_manager/choices.py b/log_manager/choices.py index c6e461a..aa46a54 100644 --- a/log_manager/choices.py +++ b/log_manager/choices.py @@ -1,13 +1,12 @@ from django.utils.translation import gettext_lazy as _ - -LOG_FILE_STATUS_CREATED = 'CRE' -LOG_FILE_STATUS_QUEUED = 'QUE' -LOG_FILE_STATUS_PARSING = 'PAR' -LOG_FILE_STATUS_PROCESSED = 'PRO' -LOG_FILE_STATUS_ERROR = 'ERR' -LOG_FILE_STATUS_INVALIDATED = 'INV' -LOG_FILE_STATUS_IGNORED = 'IGN' +LOG_FILE_STATUS_CREATED = "CRE" +LOG_FILE_STATUS_QUEUED = "QUE" +LOG_FILE_STATUS_PARSING = "PAR" +LOG_FILE_STATUS_PROCESSED = "PRO" +LOG_FILE_STATUS_ERROR = "ERR" +LOG_FILE_STATUS_INVALIDATED = "INV" +LOG_FILE_STATUS_IGNORED = "IGN" LOG_FILE_STATUS = [ (LOG_FILE_STATUS_CREATED, _("Created")), @@ -18,4 +17,3 @@ (LOG_FILE_STATUS_INVALIDATED, _("Invalidated")), (LOG_FILE_STATUS_IGNORED, _("Ignored")), ] - diff --git a/log_manager/exceptions.py b/log_manager/exceptions.py index 27d38e0..12feaa2 100644 --- a/log_manager/exceptions.py +++ b/log_manager/exceptions.py @@ -1,20 +1,26 @@ class LogFileAlreadyExistsError(Exception): ... + class InvalidDateFormatError(Exception): ... + class InvalidTemporaReferenceError(Exception): ... + class UndefinedApplicationConfigError(Exception): ... + class UndefinedCollectionConfigError(Exception): ... + class MultipleCollectionConfigError(Exception): ... + class UnsupportedFileExtentionError(Exception): - ... \ No newline at end of file + ... 
diff --git a/log_manager/management/__init__.py b/log_manager/management/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/log_manager/management/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/log_manager/management/commands/__init__.py b/log_manager/management/commands/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/log_manager/management/commands/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/log_manager/management/commands/reset_log_catalog.py b/log_manager/management/commands/reset_log_catalog.py deleted file mode 100644 index 5ded576..0000000 --- a/log_manager/management/commands/reset_log_catalog.py +++ /dev/null @@ -1,94 +0,0 @@ -from django.core.management.base import BaseCommand -from django.db import transaction - -from log_manager.models import LogFile -from metrics.models import DailyMetricJob -from metrics.services import daily_payloads -from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport -from tracker.models import LogFileDiscardedLine - - -class Command(BaseCommand): - help = ( - "Clear the log catalog stored in the database, including derived parsing " - "records, daily metric payloads, and optionally reports, " - "while preserving the source log files on disk." - ) - - def add_arguments(self, parser): - parser.add_argument( - "--collection", - action="append", - dest="collections", - help="Collection acronym to limit cleanup. 
Repeat the option for multiple collections.", - ) - parser.add_argument( - "--reports", - action="store_true", - default=False, - help="Also clear Weekly/Monthly/Yearly log reports for the selected collections.", - ) - - def handle(self, *args, **options): - collections = options.get("collections") or [] - clear_reports = options.get("reports") - - log_files = LogFile.objects.all() - if collections: - log_files = log_files.filter(collection__acron3__in=collections) - - log_file_ids = list(log_files.values_list("id", flat=True)) - if not log_file_ids: - self.stdout.write(self.style.WARNING("No log catalog rows found for cleanup.")) - return - - daily_jobs = DailyMetricJob.objects.all() - if collections: - daily_jobs = daily_jobs.filter(collection__acron3__in=collections) - payload_paths = list(daily_jobs.exclude(storage_path="").values_list("storage_path", flat=True)) - - summary = { - "log_files": len(log_file_ids), - "discarded_lines": LogFileDiscardedLine.objects.filter( - log_file_id__in=log_file_ids - ).count(), - "daily_metric_jobs": daily_jobs.count(), - } - - for storage_path in payload_paths: - daily_payloads.delete_payload(storage_path) - - with transaction.atomic(): - LogFileDiscardedLine.objects.filter(log_file_id__in=log_file_ids).delete() - daily_jobs.delete() - LogFile.objects.filter(id__in=log_file_ids).delete() - - if clear_reports: - report_qs = WeeklyLogReport.objects.all() - m_qs = MonthlyLogReport.objects.all() - y_qs = YearlyLogReport.objects.all() - if collections: - report_qs = report_qs.filter(collection__acron3__in=collections) - m_qs = m_qs.filter(collection__acron3__in=collections) - y_qs = y_qs.filter(collection__acron3__in=collections) - summary["weekly_reports"] = report_qs.count() - summary["monthly_reports"] = m_qs.count() - summary["yearly_reports"] = y_qs.count() - report_qs.delete() - m_qs.delete() - y_qs.delete() - - msg = ( - f"Cleared log catalog: " - f"{summary['log_files']} log files, " - f"{summary['discarded_lines']} 
discarded lines, " - f"{summary['daily_metric_jobs']} daily metric jobs." - ) - if clear_reports: - msg += ( - f" Also cleared reports: " - f"{summary['weekly_reports']} weekly, " - f"{summary['monthly_reports']} monthly, " - f"{summary['yearly_reports']} yearly." - ) - self.stdout.write(self.style.SUCCESS(msg)) diff --git a/log_manager/models.py b/log_manager/models.py index 6bf04d8..c6d9895 100644 --- a/log_manager/models.py +++ b/log_manager/models.py @@ -7,31 +7,36 @@ from wagtailautocomplete.edit_handlers import AutocompletePanel from collection.models import Collection +from core.utils.date_utils import get_date_obj -from . import choices +from log_manager import choices class LogFile(models.Model): created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) - date = models.DateField(verbose_name=_("Date"), null=True, blank=True, db_index=True) - hash = models.CharField(_("Hash MD5"), max_length=32, null=True, blank=True, unique=True) + date = models.DateField( + verbose_name=_("Date"), null=True, blank=True, db_index=True + ) + hash = models.CharField( + _("Hash MD5"), max_length=32, null=True, blank=True, unique=True + ) path = models.CharField(_("Name"), max_length=255, null=False, blank=False) stat_result = models.JSONField(_("OS Stat Result"), null=False, blank=False) status = models.CharField( - _("Status"), - choices=choices.LOG_FILE_STATUS, - max_length=3, - blank=False, + _("Status"), + choices=choices.LOG_FILE_STATUS, + max_length=3, + blank=False, null=False, ) validation = models.JSONField( - _("Validation"), - null=True, + _("Validation"), + null=True, blank=True, default=dict, ) @@ -64,26 +69,22 @@ class LogFile(models.Model): ) panels = [ - FieldPanel('hash'), - FieldPanel('date'), - FieldPanel('path'), - FieldPanel('stat_result'), - FieldPanel('status'), - FieldPanel('validation'), - FieldPanel('summary'), - 
FieldPanel('last_processed_line'), - FieldPanel('parse_heartbeat_at'), - AutocompletePanel('collection'), + FieldPanel("hash"), + FieldPanel("date"), + FieldPanel("path"), + FieldPanel("stat_result"), + FieldPanel("status"), + FieldPanel("validation"), + FieldPanel("summary"), + FieldPanel("last_processed_line"), + FieldPanel("parse_heartbeat_at"), + AutocompletePanel("collection"), ] class Meta: verbose_name = _("Log File") verbose_name_plural = _("Log Files") - @classmethod - def get(cls, hash): - return cls.objects.get(hash=hash) - @classmethod def create_or_update(cls, collection, path, stat_result, hash, status=None): try: @@ -97,17 +98,73 @@ def create_or_update(cls, collection, path, stat_result, hash, status=None): }, ) except IntegrityError: - obj = cls.get(hash=hash) + obj = cls.objects.get(hash=hash) created = False if created: - logging.info(f'File {path} added to the database.') + logging.info(f"File {path} added to the database.") else: obj.updated = timezone.now() obj.save(update_fields=["updated"]) - logging.info(f'File {path} already exists in the database.') + logging.info(f"File {path} already exists in the database.") return obj - + + @classmethod + def for_collection_date(cls, collection, access_date, status_filters=None): + queryset = ( + cls.objects.filter( + collection=collection, + date=access_date, + ) + .select_related("collection") + .order_by("path", "hash") + ) + if status_filters: + queryset = queryset.filter(status__in=status_filters) + + return list(queryset) + + @classmethod + def for_collection_date_hashes(cls, collection, access_date, log_hashes): + return list( + cls.objects.filter( + collection=collection, + date=access_date, + hash__in=log_hashes, + ) + .select_related("collection") + .order_by("path", "hash") + ) + + @classmethod + def distinct_access_dates_for_parsing( + cls, + collection, + from_date, + until_date, + status_filters, + skip_hashes=None, + ): + date_queryset = ( + cls.objects.filter( + 
status__in=status_filters, + collection=collection, + date__gte=from_date, + date__lte=until_date, + ) + .exclude(hash__in=skip_hashes or []) + .values_list("date", flat=True) + .distinct() + .order_by("date") + ) + + access_dates = set() + for value in list(date_queryset): + access_date = value if hasattr(value, "isoformat") else get_date_obj(value) + if access_date and from_date <= access_date <= until_date: + access_dates.add(access_date) + return sorted(access_dates) + def __str__(self): - return f'{self.path}' + return f"{self.path}" diff --git a/log_manager/services/__init__.py b/log_manager/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/log_manager/services/catalog.py b/log_manager/services/catalog.py new file mode 100644 index 0000000..fad59b3 --- /dev/null +++ b/log_manager/services/catalog.py @@ -0,0 +1,80 @@ +import logging +import os + +from django.conf import settings + +from collection.models import Collection +from core.utils import date_utils +from log_manager import models, utils +from log_manager_config import models as lmc_models + + +def catalog_log_files_from_configured_directories( + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, +): + from_date_str, until_date_str = date_utils.get_date_range_str( + from_date, until_date, days_to_go_back + ) + visible_dates = date_utils.get_date_objs_from_date_range( + from_date_str, until_date_str + ) + supported_extensions = settings.SUPPORTED_LOGFILE_EXTENSIONS + if not supported_extensions: + logging.error("No supported log file extensions configured.") + + for collection_code in collections or Collection.acron3_list(): + collection = Collection.objects.get(acron3=collection_code) + directories = lmc_models.CollectionLogDirectory.objects.filter( + config__collection__acron3=collection_code, + active=True, + ) + if not directories: + logging.error( + "No CollectionLogDirectory found for collection %s.", collection_code + ) + + for directory in 
directories: + _catalog_log_files_in_directory( + collection=collection, + directory_path=directory.path, + visible_dates=visible_dates, + supported_extensions=supported_extensions, + ) + + +def _catalog_log_files_in_directory( + collection, + directory_path, + visible_dates, + supported_extensions, +): + for root, _sub_dirs, files in os.walk(directory_path): + for name in files: + _name, extension = os.path.splitext(name) + if extension.lower() not in supported_extensions: + continue + + file_path = os.path.join(root, name) + file_stat = os.stat(file_path) + file_ctime = date_utils.get_date_obj_from_timestamp(file_stat.st_ctime) + + logging.debug("Checking file %s with ctime %s.", file_path, file_ctime) + if file_ctime not in visible_dates: + continue + + try: + models.LogFile.create_or_update( + collection=collection, + path=file_path, + stat_result=file_stat, + hash=utils.hash_file(file_path), + ) + except Exception as exc: + logging.error( + "Error cataloging file %s. Error: %s", + file_path, + exc, + ) diff --git a/log_manager/services/validation.py b/log_manager/services/validation.py new file mode 100644 index 0000000..777ac47 --- /dev/null +++ b/log_manager/services/validation.py @@ -0,0 +1,199 @@ +import logging + +from collection.models import Collection +from core.utils import date_utils +from log_manager import choices, models, utils +from log_manager_config import models as lmc_models + +LOGFILE_STAT_RESULT_CTIME_INDEX = 9 + + +def get_validation_candidate_hashes_by_collection( + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, + ignore_date=False, + revalidate=False, + status_list=None, +): + collection_codes = collections or Collection.acron3_list() + logging.info("Validating log files for collections: %s.", collection_codes) + + visible_dates = _get_validation_visible_dates( + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + ignore_date=ignore_date, + ) + if visible_dates is None: + 
return None + + status_filter = _get_validation_status_filter( + revalidate=revalidate, + status_list=status_list, + ) + + log_hashes_by_collection = {} + for collection_code in collection_codes: + log_hashes_by_collection[collection_code] = _get_validation_candidate_hashes( + collection_code=collection_code, + status_filter=status_filter, + visible_dates=visible_dates, + ignore_date=ignore_date, + ) + + return log_hashes_by_collection + + +def _get_validation_visible_dates( + from_date=None, + until_date=None, + days_to_go_back=None, + ignore_date=False, +): + from_date_str, until_date_str = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + visible_dates = date_utils.get_date_objs_from_date_range( + from_date_str, + until_date_str, + ) + + if ignore_date: + return visible_dates + + if not visible_dates: + logging.warning("No visible dates found for log validation.") + return None + + logging.info("Interval: %s to %s.", visible_dates[0], visible_dates[-1]) + return visible_dates + + +def _get_validation_status_filter(revalidate=False, status_list=None): + status_filter = [choices.LOG_FILE_STATUS_CREATED] + + if revalidate: + status_filter += status_list or [ + choices.LOG_FILE_STATUS_QUEUED, + choices.LOG_FILE_STATUS_INVALIDATED, + choices.LOG_FILE_STATUS_ERROR, + ] + + return status_filter + + +def _get_validation_candidate_hashes( + collection_code, + status_filter, + visible_dates, + ignore_date=False, +): + log_hashes = [] + log_files = models.LogFile.objects.filter( + status__in=status_filter, + collection__acron3=collection_code, + ) + + for log_file in log_files: + if not ignore_date and not _log_file_ctime_is_in_date_range( + log_file, + visible_dates, + ): + continue + + log_hashes.append(log_file.hash) + + return log_hashes + + +def _log_file_ctime_is_in_date_range(log_file, visible_dates): + file_ctime = date_utils.get_date_obj_from_timestamp( + log_file.stat_result[LOGFILE_STAT_RESULT_CTIME_INDEX] + ) + return file_ctime 
in visible_dates + + +def validate_log_file_and_update_status(log_file_hash): + log_file = models.LogFile.objects.get(hash=log_file_hash) + collection = log_file.collection.acron3 + buffer_size, sample_size = _get_collection_validation_settings(collection) + + logging.info("Validating log file %s.", log_file.path) + validation_result = utils.validate_file( + path=log_file.path, + buffer_size=buffer_size, + sample_size=sample_size, + ) + _normalize_validation_result_for_storage(validation_result) + + _update_log_file_with_validation_result( + log_file=log_file, + validation_result=validation_result, + buffer_size=buffer_size, + sample_size=sample_size, + ) + + +def _get_collection_validation_settings( + collection, default_buffer_size=2048, default_sample_size=0.1 +): + col_configs = lmc_models.LogManagerCollectionConfig.objects.filter( + collection__acron3=collection + ).first() + + if not col_configs: + logging.warning( + "No LogManagerCollectionConfig found for collection %s. Using default values.", + collection, + ) + return default_buffer_size, default_sample_size + + return col_configs.buffer_size, col_configs.sample_size + + +def _normalize_validation_result_for_storage(validation_result): + if "datetimes" in validation_result.get("content", {}).get("summary", {}): + del validation_result["content"]["summary"]["datetimes"] + + if "probably_date" not in validation_result: + return + + probably_date = validation_result["probably_date"] + if isinstance(probably_date, dict): + logging.error("Error determining probably_date: %s", probably_date.get("error")) + validation_result["probably_date"] = None + return + + try: + validation_result["probably_date"] = date_utils.get_date_str(probably_date) + except (ValueError, AttributeError) as exc: + logging.error("Error serializing probably_date: %s", exc) + validation_result["probably_date"] = None + + +def _update_log_file_with_validation_result( + log_file, + validation_result, + buffer_size, + sample_size, +): + 
log_file.validation = validation_result + log_file.validation.update({"buffer_size": buffer_size, "sample_size": sample_size}) + + if validation_result.get("is_valid", {}).get("all", False): + log_file.date = validation_result.get("probably_date") or None + log_file.status = choices.LOG_FILE_STATUS_QUEUED + else: + log_file.status = choices.LOG_FILE_STATUS_INVALIDATED + + logging.info( + "Log file %s (%s) has status %s.", + log_file.path, + log_file.collection.acron3, + log_file.status, + ) + log_file.save() diff --git a/log_manager/tasks.py b/log_manager/tasks.py index 614106d..a04f22a 100644 --- a/log_manager/tasks.py +++ b/log_manager/tasks.py @@ -1,25 +1,15 @@ import logging -import os from celery import chord -from django.conf import settings -from collection.models import Collection from config import celery_app -from core.utils import date_utils +from config.collections import get_collection_parse_queue from core.utils.request_utils import _get_user -from log_manager_config import models as lmc_models -from metrics.services.resources import extract_celery_queue_name -from metrics.tasks import task_parse_logs +from log_manager.services import catalog, validation +from metrics.tasks.log_parsing import task_enqueue_log_parsing_jobs -from . import choices, models, utils -LOGFILE_STAT_RESULT_CTIME_INDEX = 9 - - -@celery_app.task( - bind=True, name="[Log Pipeline] 1. Search Logs (Manual)", queue="load" -) +@celery_app.task(bind=True, name="[Log Pipeline] 1. 
Search Logs (Manual)", queue="load") def task_search_log_files( self, collections=None, @@ -38,50 +28,12 @@ def task_search_log_files( """ _get_user(self.request, username=username, user_id=user_id) - from_date_str, until_date_str = date_utils.get_date_range_str( - from_date, until_date, days_to_go_back - ) - visible_dates = date_utils.get_date_objs_from_date_range( - from_date_str, until_date_str + catalog.catalog_log_files_from_configured_directories( + collections=collections, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, ) - supported_extensions = settings.SUPPORTED_LOGFILE_EXTENSIONS - if not supported_extensions: - logging.error("No supported log file extensions configured.") - - for collection_code in collections or Collection.acron3_list(): - collection = Collection.objects.get(acron3=collection_code) - directories = lmc_models.CollectionLogDirectory.objects.filter( - config__collection__acron3=collection_code, - active=True, - ) - if not directories: - logging.error( - "No CollectionLogDirectory found for collection %s.", collection_code - ) - - for directory in directories: - for root, _sub_dirs, files in os.walk(directory.path): - for name in files: - _name, extension = os.path.splitext(name) - if extension.lower() not in supported_extensions: - continue - - file_path = os.path.join(root, name) - file_stat = os.stat(file_path) - file_ctime = date_utils.get_date_obj_from_timestamp( - file_stat.st_ctime - ) - - logging.debug( - "Checking file %s with ctime %s.", file_path, file_ctime - ) - if file_ctime in visible_dates: - models.LogFile.create_or_update( - collection=collection, - path=file_path, - stat_result=file_stat, - hash=utils.hash_file(file_path), - ) if trigger_validation: task_validate_log_files.apply_async( @@ -122,47 +74,23 @@ def task_validate_log_files( When trigger_parse=True, one parse orchestration task is enqueued per collection and routed to the proper parse_ queue. 
""" - collection_codes = collections or Collection.acron3_list() - logging.info("Validating log files for collections: %s.", collection_codes) - - from_date_str, until_date_str = date_utils.get_date_range_str( - from_date, until_date, days_to_go_back - ) - visible_dates = date_utils.get_date_objs_from_date_range( - from_date_str, until_date_str + log_hashes_by_collection = validation.get_validation_candidate_hashes_by_collection( + collections=collections, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + ignore_date=ignore_date, + revalidate=revalidate, + status_list=status_list, ) - if not ignore_date: - if not visible_dates: - logging.warning("No visible dates found for log validation.") - return - logging.info("Interval: %s to %s.", visible_dates[0], visible_dates[-1]) - - status_filter = [choices.LOG_FILE_STATUS_CREATED] - if revalidate: - status_filter += status_list or [ - choices.LOG_FILE_STATUS_QUEUED, - choices.LOG_FILE_STATUS_INVALIDATED, - choices.LOG_FILE_STATUS_ERROR, - ] - - tasks_by_collection = {} - for collection_code in collection_codes: - tasks_by_collection[collection_code] = [] - log_files = models.LogFile.objects.filter( - status__in=status_filter, - collection__acron3=collection_code, - ) - for log_file in log_files: - if not ignore_date: - file_ctime = date_utils.get_date_obj_from_timestamp( - log_file.stat_result[LOGFILE_STAT_RESULT_CTIME_INDEX] - ) - if file_ctime not in visible_dates: - continue + if log_hashes_by_collection is None: + return - tasks_by_collection[collection_code].append( - task_validate_log_file.s(log_file.hash, user_id, username) - ) + tasks_by_collection = _build_validation_tasks( + log_hashes_by_collection=log_hashes_by_collection, + user_id=user_id, + username=username, + ) if trigger_parse: _enqueue_parse_after_validation( @@ -189,33 +117,7 @@ def task_validate_log_files( def task_validate_log_file(self, log_file_hash, user_id=None, username=None): """Validate a single LogFile and 
update its status.""" _get_user(self.request, username=username, user_id=user_id) - log_file = models.LogFile.objects.get(hash=log_file_hash) - collection = log_file.collection.acron3 - - buffer_size, sample_size = _fetch_validation_parameters(collection) - - logging.info("Validating log file %s.", log_file.path) - val_result = utils.validate_file( - path=log_file.path, buffer_size=buffer_size, sample_size=sample_size - ) - _clean_validation_result(val_result) - - log_file.validation = val_result - log_file.validation.update({"buffer_size": buffer_size, "sample_size": sample_size}) - - if val_result.get("is_valid", {}).get("all", False): - log_file.date = val_result.get("probably_date") or None - log_file.status = choices.LOG_FILE_STATUS_QUEUED - else: - log_file.status = choices.LOG_FILE_STATUS_INVALIDATED - - logging.info( - "Log file %s (%s) has status %s.", - log_file.path, - log_file.collection.acron3, - log_file.status, - ) - log_file.save() + validation.validate_log_file_and_update_status(log_file_hash) @celery_app.task(bind=True, name="[Log Pipeline] Daily Routine (Auto)", queue="load") @@ -227,6 +129,16 @@ def task_daily_log_ingestion_pipeline(self): task_search_log_files.apply_async(kwargs={"trigger_validation": True}) +def _build_validation_tasks(log_hashes_by_collection, user_id, username): + return { + collection_code: [ + task_validate_log_file.s(log_file_hash, user_id, username) + for log_file_hash in log_hashes + ] + for collection_code, log_hashes in log_hashes_by_collection.items() + } + + def _enqueue_parse_after_validation( tasks_by_collection, from_date, until_date, days_to_go_back, user_id, username ): @@ -243,7 +155,7 @@ def _enqueue_parse_after_validation( ) ) else: - task_parse_logs.apply_async( + task_enqueue_log_parsing_jobs.apply_async( **_build_parse_apply_kwargs( collection_code, from_date, @@ -266,7 +178,7 @@ def _build_parse_signature( user_id, username, ) - parse_callback = task_parse_logs.si(**apply_kwargs["kwargs"]) + 
parse_callback = task_enqueue_log_parsing_jobs.si(**apply_kwargs["kwargs"]) if apply_kwargs.get("queue"): parse_callback.set(queue=apply_kwargs["queue"]) return parse_callback @@ -276,7 +188,7 @@ def _build_parse_apply_kwargs( collection_code, from_date, until_date, days_to_go_back, user_id, username ): collections = [collection_code] - parse_queue = extract_celery_queue_name(collection_code) + parse_queue = get_collection_parse_queue(collection_code) apply_kwargs = { "kwargs": { "collections": collections, @@ -290,38 +202,3 @@ def _build_parse_apply_kwargs( "queue": parse_queue, } return apply_kwargs - - -def _fetch_validation_parameters( - collection, default_buffer_size=0.1, default_sample_size=2048 -): - col_configs = lmc_models.LogManagerCollectionConfig.objects.filter( - collection__acron3=collection - ).first() - if not col_configs: - logging.warning( - "No LogManagerCollectionConfig found for collection %s. Using default values.", - collection, - ) - return default_buffer_size, default_sample_size - return col_configs.buffer_size, col_configs.sample_size - - -def _clean_validation_result(val_result): - if "datetimes" in val_result.get("content", {}).get("summary", {}): - del val_result["content"]["summary"]["datetimes"] - - if "probably_date" not in val_result: - return - - probably_date = val_result["probably_date"] - if isinstance(probably_date, dict): - logging.error("Error determining probably_date: %s", probably_date.get("error")) - val_result["probably_date"] = None - return - - try: - val_result["probably_date"] = date_utils.get_date_str(probably_date) - except (ValueError, AttributeError) as exc: - logging.error("Error serializing probably_date: %s", exc) - val_result["probably_date"] = None diff --git a/log_manager/tests.py b/log_manager/tests.py deleted file mode 100644 index 8832e25..0000000 --- a/log_manager/tests.py +++ /dev/null @@ -1,89 +0,0 @@ -from unittest.mock import patch - -from django.db import IntegrityError -from django.test import 
TestCase - -from collection.models import Collection - -from . import choices, tasks -from .models import LogFile - - -class LogFileTests(TestCase): - def setUp(self): - self.collection = Collection.objects.create(acron3="books", acron2="bk") - - def test_create_or_update_creates_log_file(self): - log_file = LogFile.create_or_update( - collection=self.collection, - path="/tmp/new.log.gz", - stat_result={"size": 10}, - hash="1" * 32, - ) - - self.assertEqual(log_file.collection, self.collection) - self.assertEqual(log_file.path, "/tmp/new.log.gz") - self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_CREATED) - - def test_create_or_update_refetches_existing_log_after_integrity_error(self): - existing = LogFile.objects.create( - collection=self.collection, - path="/tmp/existing.log.gz", - stat_result={"size": 10}, - hash="1" * 32, - status=choices.LOG_FILE_STATUS_CREATED, - ) - - with patch.object(LogFile.objects, "get_or_create", side_effect=IntegrityError): - log_file = LogFile.create_or_update( - collection=self.collection, - path="/tmp/existing.log.gz", - stat_result={"size": 10}, - hash=existing.hash, - ) - - self.assertEqual(log_file.pk, existing.pk) - - -class ValidateLogFilesTaskTests(TestCase): - def test_validate_log_files_returns_for_empty_visible_date_range(self): - with patch("log_manager.tasks.task_validate_log_file.s") as mocked_signature: - result = tasks.task_validate_log_files.run( - collections=["books"], - from_date="2024-02-02", - until_date="2024-02-01", - ) - - self.assertIsNone(result) - mocked_signature.assert_not_called() - - def test_validate_log_files_routes_parse_callback_to_collection_parse_queue(self): - with patch("metrics.tasks.task_parse_logs.apply_async") as mocked_apply_async: - tasks.task_validate_log_files.run( - collections=["books"], - from_date="2024-02-01", - until_date="2024-02-02", - trigger_parse=True, - ) - - mocked_apply_async.assert_called_once() - self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], 
"parse_small") - self.assertEqual( - mocked_apply_async.call_args.kwargs["kwargs"]["queue_name"], - "parse_small", - ) - - def test_validate_log_files_routes_each_collection_parse_to_its_queue(self): - with patch("metrics.tasks.task_parse_logs.apply_async") as mocked_apply_async: - tasks.task_validate_log_files.run( - collections=["books", "scl"], - from_date="2024-02-01", - until_date="2024-02-02", - trigger_parse=True, - ) - - calls = { - call.kwargs["kwargs"]["collections"][0]: call.kwargs["queue"] - for call in mocked_apply_async.call_args_list - } - self.assertEqual(calls, {"books": "parse_small", "scl": "parse_xlarge"}) diff --git a/log_manager/tests/__init__.py b/log_manager/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/log_manager/tests/test_models.py b/log_manager/tests/test_models.py new file mode 100644 index 0000000..85eada8 --- /dev/null +++ b/log_manager/tests/test_models.py @@ -0,0 +1,44 @@ +from unittest.mock import patch + +from django.db import IntegrityError +from django.test import TestCase + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile + + +class LogFileModelTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def test_create_or_update_creates_log_file(self): + log_file = LogFile.create_or_update( + collection=self.collection, + path="/tmp/new.log.gz", + stat_result={"size": 10}, + hash="1" * 32, + ) + + self.assertEqual(log_file.collection, self.collection) + self.assertEqual(log_file.path, "/tmp/new.log.gz") + self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_CREATED) + + def test_create_or_update_refetches_existing_after_integrity_error(self): + existing = LogFile.objects.create( + collection=self.collection, + path="/tmp/existing.log.gz", + stat_result={"size": 10}, + hash="1" * 32, + status=choices.LOG_FILE_STATUS_CREATED, + ) + + with patch.object(LogFile.objects, 
"get_or_create", side_effect=IntegrityError): + log_file = LogFile.create_or_update( + collection=self.collection, + path="/tmp/existing.log.gz", + stat_result={"size": 10}, + hash=existing.hash, + ) + + self.assertEqual(log_file.pk, existing.pk) diff --git a/log_manager/tests/test_tasks.py b/log_manager/tests/test_tasks.py new file mode 100644 index 0000000..79d1db7 --- /dev/null +++ b/log_manager/tests/test_tasks.py @@ -0,0 +1,53 @@ +from unittest.mock import patch + +from django.test import TestCase + +from log_manager import tasks + + +class ValidateLogFilesTaskTests(TestCase): + def test_returns_none_for_empty_date_range(self): + with patch("log_manager.tasks.task_validate_log_file.s") as mocked_signature: + result = tasks.task_validate_log_files.run( + collections=["books"], + from_date="2024-02-02", + until_date="2024-02-01", + ) + + self.assertIsNone(result) + mocked_signature.assert_not_called() + + def test_routes_parse_callback_to_collection_queue(self): + with patch( + "log_manager.tasks.task_enqueue_log_parsing_jobs.apply_async" + ) as mocked_apply_async: + tasks.task_validate_log_files.run( + collections=["books"], + from_date="2024-02-01", + until_date="2024-02-02", + trigger_parse=True, + ) + + mocked_apply_async.assert_called_once() + self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small") + self.assertEqual( + mocked_apply_async.call_args.kwargs["kwargs"]["queue_name"], + "parse_small", + ) + + def test_routes_each_collection_to_its_queue(self): + with patch( + "log_manager.tasks.task_enqueue_log_parsing_jobs.apply_async" + ) as mocked_apply_async: + tasks.task_validate_log_files.run( + collections=["books", "scl"], + from_date="2024-02-01", + until_date="2024-02-02", + trigger_parse=True, + ) + + calls = { + call.kwargs["kwargs"]["collections"][0]: call.kwargs["queue"] + for call in mocked_apply_async.call_args_list + } + self.assertEqual(calls, {"books": "parse_small", "scl": "parse_xlarge"}) diff --git 
a/log_manager/tests/test_validation.py b/log_manager/tests/test_validation.py new file mode 100644 index 0000000..957faf0 --- /dev/null +++ b/log_manager/tests/test_validation.py @@ -0,0 +1,81 @@ +import tempfile +from datetime import date +from unittest.mock import patch + +from django.test import TestCase + +from collection.models import Collection +from log_manager import choices, utils +from log_manager.models import LogFile +from log_manager.services import validation + + +class ValidationServiceTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="scl", acron2="sc") + + def test_validation_settings_defaults_match_validator_arguments(self): + buffer_size, sample_size = validation._get_collection_validation_settings( + self.collection.acron3 + ) + + self.assertEqual(buffer_size, 2048) + self.assertEqual(sample_size, 0.1) + + @patch("log_manager.utils.validator.pipeline_validate") + @patch("log_manager.utils.validator.get_total_lines", return_value=10) + def test_validate_file_clamps_sample_size_to_avoid_zero_range_step( + self, mock_get_total_lines, mock_pipeline_validate + ): + utils.validate_file("/tmp/access.log", sample_size=2048, buffer_size=2048) + + mock_get_total_lines.assert_called_once_with( + path="/tmp/access.log", + buffer_size=2048, + ) + self.assertEqual(mock_pipeline_validate.call_args.kwargs["sample_size"], 1.0) + + @patch("log_manager.utils.validator.validate_path_name", return_value={"all": True}) + def test_validate_file_returns_invalid_result_for_empty_log( + self, mock_validate_path_name + ): + with tempfile.NamedTemporaryFile("w", encoding="utf-8") as tmp_file: + path = tmp_file.name + + result = utils.validate_file(path, sample_size=1.0, buffer_size=2048) + + self.assertFalse(result["is_valid"]["all"]) + self.assertEqual( + result["content"]["summary"]["total_lines"]["error"], + "File is empty", + ) + self.assertIsNone(result["probably_date"]) + + 
@patch("log_manager.services.validation.utils.validate_file") + def test_validate_log_file_updates_status_and_normalizes_result( + self, mock_validate_file + ): + log_file = LogFile.objects.create( + collection=self.collection, + path="/tmp/access.log", + stat_result={"size": 10}, + hash="2" * 32, + status=choices.LOG_FILE_STATUS_CREATED, + ) + mock_validate_file.return_value = { + "probably_date": date(2026, 5, 10), + "is_valid": {"all": True}, + "content": { + "summary": { + "datetimes": ["2026-05-10T00:00:00"], + }, + }, + } + + validation.validate_log_file_and_update_status(log_file.hash) + + log_file.refresh_from_db() + self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_QUEUED) + self.assertEqual(log_file.date, date(2026, 5, 10)) + self.assertNotIn("datetimes", log_file.validation["content"]["summary"]) + self.assertEqual(log_file.validation["probably_date"], "2026-05-10") diff --git a/log_manager/utils.py b/log_manager/utils.py index c7dd2db..16a996f 100644 --- a/log_manager/utils.py +++ b/log_manager/utils.py @@ -2,7 +2,7 @@ import hashlib from collections import deque -from scielo_log_validator import validator +from scielo_log_validator import exceptions, validator def hash_file(path, num_lines=500): @@ -27,28 +27,90 @@ def hash_file(path, num_lines=500): opener = gzip.open if _is_gzip(path) else open - with opener(path, 'rb') as file: - first_lines = b''.join([file.readline() for _ in range(num_lines)]) + with opener(path, "rb") as file: + first_lines = b"".join([file.readline() for _ in range(num_lines)]) md5_hash.update(first_lines) tail = deque(maxlen=num_lines) for line in file: tail.append(line) - md5_hash.update(b''.join(tail)) + md5_hash.update(b"".join(tail)) return md5_hash.hexdigest() def _is_gzip(path): - with open(path, 'rb') as f: - return f.read(2) == b'\x1f\x8b' + with open(path, "rb") as f: + return f.read(2) == b"\x1f\x8b" + + +def validate_file( + path, + sample_size=0.1, + buffer_size=2048, + days_delta=5, + 
apply_path_validation=True, + apply_content_validation=True, +): + if apply_content_validation: + if _is_empty_log_file(path, buffer_size): + return _empty_log_validation_result(path, apply_path_validation) + + sample_size = _safe_sample_size(path, sample_size, buffer_size) -def validate_file(path, sample_size=0.1, buffer_size=2048, days_delta=5, apply_path_validation=True, apply_content_validation=True): return validator.pipeline_validate( - path=path, + path=path, sample_size=sample_size, buffer_size=buffer_size, days_delta=days_delta, apply_path_validation=apply_path_validation, apply_content_validation=apply_content_validation, ) + + +def _is_empty_log_file(path, buffer_size): + try: + opener = gzip.open if _is_gzip(path) else open + with opener(path, "rb") as file: + return file.readline() == b"" + except OSError: + return False + + +def _empty_log_validation_result(path, apply_path_validation): + result = { + "mode": { + "path_validation": apply_path_validation, + "content_validation": True, + }, + "content": {"summary": {"total_lines": {"error": "File is empty"}}}, + "is_valid": {"ips": False, "dates": False, "all": False}, + "probably_date": None, + } + + if apply_path_validation: + result["path"] = validator.validate_path_name(path) + + return result + + +def _safe_sample_size(path, sample_size, buffer_size): + try: + total_lines = validator.get_total_lines(path=path, buffer_size=buffer_size) + except ( + exceptions.TruncatedLogFileError, + exceptions.InvalidLogFileMimeError, + exceptions.LogFileIsEmptyError, + ): + return sample_size + + if total_lines <= 1: + return 1.0 + + if sample_size >= 1.0: + return 1.0 + + if int(total_lines * sample_size) <= 0: + return 1.0 / total_lines + + return sample_size diff --git a/log_manager/views.py b/log_manager/views.py index 91ea44a..60f00ef 100644 --- a/log_manager/views.py +++ b/log_manager/views.py @@ -1,3 +1 @@ -from django.shortcuts import render - # Create your views here. 
diff --git a/log_manager/wagtail_hooks.py b/log_manager/wagtail_hooks.py index 1548ad3..cf7e908 100644 --- a/log_manager/wagtail_hooks.py +++ b/log_manager/wagtail_hooks.py @@ -1,13 +1,12 @@ from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from wagtail.snippets.models import register_snippet +from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from config.menu import get_menu_order +from log_manager.models import LogFile from log_manager_config.wagtail_hooks import LogManagerCollectionConfigSnippetViewSet from metrics.wagtail_hooks import DailyMetricJobSnippetViewSet -from log_manager.models import LogFile - class LogFileSnippetViewSet(SnippetViewSet): model = LogFile @@ -16,27 +15,27 @@ class LogFileSnippetViewSet(SnippetViewSet): menu_order = 500 list_display = ( "path", - "collection", - "status", + "collection", + "status", "date", "validation", "summary", "last_processed_line", "parse_heartbeat_at", - "hash" + "hash", ) list_filter = ("status", "collection", "date") search_fields = ("path", "hash", "collection__acron3", "collection__main_name") class LogSnippetViewSetGroup(SnippetViewSetGroup): - menu_name = 'log_manager' + menu_name = "log_manager" menu_label = _("Log Manager") menu_icon = "folder-open-inverse" menu_order = get_menu_order("log_manager") items = ( LogManagerCollectionConfigSnippetViewSet, - LogFileSnippetViewSet, + LogFileSnippetViewSet, DailyMetricJobSnippetViewSet, ) diff --git a/log_manager_config/admin.py b/log_manager_config/admin.py index 8c38f3f..846f6b4 100644 --- a/log_manager_config/admin.py +++ b/log_manager_config/admin.py @@ -1,3 +1 @@ -from django.contrib import admin - # Register your models here. 
diff --git a/log_manager_config/exceptions.py b/log_manager_config/exceptions.py index 0a6a6a9..de5e309 100644 --- a/log_manager_config/exceptions.py +++ b/log_manager_config/exceptions.py @@ -1,8 +1,10 @@ class UndefinedCollectionLogDirectoryError(Exception): ... + class UndefinedCollectionEmailError(Exception): ... + class UndefinedSupportedLogFile(Exception): ... diff --git a/log_manager_config/models.py b/log_manager_config/models.py index 35b5f90..f8fc106 100644 --- a/log_manager_config/models.py +++ b/log_manager_config/models.py @@ -3,39 +3,37 @@ from django.db import models from django.utils import timezone from django.utils.translation import gettext_lazy as _ - -from modelcluster.models import ClusterableModel from modelcluster.fields import ParentalKey -from wagtail.models import Orderable +from modelcluster.models import ClusterableModel from wagtail.admin.panels import FieldPanel, InlinePanel +from wagtail.models import Orderable from wagtailautocomplete.edit_handlers import AutocompletePanel from collection.models import Collection from core.models import CommonControlField - class LogManagerCollectionConfig(ClusterableModel, CommonControlField): collection = models.OneToOneField( Collection, - verbose_name=_('Collection'), + verbose_name=_("Collection"), on_delete=models.CASCADE, - related_name="log_manager_config" + related_name="log_manager_config", ) sample_size = models.FloatField( - verbose_name=_('Sample Size'), + verbose_name=_("Sample Size"), blank=False, null=False, default=0.1, ) buffer_size = models.IntegerField( - verbose_name=_('Buffer Size'), + verbose_name=_("Buffer Size"), blank=False, null=False, default=2048, ) expected_logs_per_day = models.IntegerField( - verbose_name=_('Expected Logs Per Day'), + verbose_name=_("Expected Logs Per Day"), default=1, ) @@ -49,17 +47,17 @@ class LogManagerCollectionConfig(ClusterableModel, CommonControlField): ] def __str__(self): - return f'{self.collection.acron3} Config' + return 
f"{self.collection.acron3} Config" class Meta: - verbose_name = _('Log Manager Collection Config') - verbose_name_plural = _('Log Manager Collection Configs') + verbose_name = _("Log Manager Collection Config") + verbose_name_plural = _("Log Manager Collection Configs") @classmethod def load(cls, data, user): for item in data: try: - collection = Collection.objects.get(acron3=item.get('acronym')) + collection = Collection.objects.get(acron3=item.get("acronym")) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -67,9 +65,9 @@ def load(cls, data, user): cls.create_or_update( user=user, collection=collection, - sample_size=item.get('sample_size', 0.1), - buffer_size=item.get('buffer_size', 2048), - expected_logs_per_day=item.get('quantity', 1), + sample_size=item.get("sample_size", 0.1), + buffer_size=item.get("buffer_size", 2048), + expected_logs_per_day=item.get("quantity", 1), ) @classmethod @@ -85,58 +83,59 @@ def create_or_update( if created: obj.creator = user obj.created = timezone.now() - + obj.updated_by = user obj.updated = timezone.now() obj.sample_size = sample_size obj.buffer_size = buffer_size obj.expected_logs_per_day = expected_logs_per_day obj.save() - logging.info(f'Config for {collection.acron3} updated.') + logging.info(f"Config for {collection.acron3} updated.") return obj - class CollectionLogDirectory(Orderable, CommonControlField): config = ParentalKey( - 'LogManagerCollectionConfig', - related_name='directories', + "LogManagerCollectionConfig", + related_name="directories", on_delete=models.CASCADE, null=True, blank=True, ) path = models.CharField( - verbose_name=_('Path'), - max_length=255, - blank=False, + verbose_name=_("Path"), + max_length=255, + blank=False, null=False, ) directory_name = models.CharField( - verbose_name=_('Directory Name'), - max_length=255, + verbose_name=_("Directory Name"), + max_length=255, blank=True, null=True, ) active = models.BooleanField( - 
verbose_name=_('Active'), + verbose_name=_("Active"), default=True, ) translator_class = models.CharField( - verbose_name=_('URL Translator Class'), + verbose_name=_("URL Translator Class"), blank=False, null=False, - default='classic', + default="classic", ) def __str__(self): - return f'{self.config.collection} - {self.path} - {self.directory_name}' - + return f"{self.config.collection} - {self.path} - {self.directory_name}" + @classmethod def load(cls, data, user): for item in data: try: - collection = Collection.objects.get(acron3=item.get('acronym')) - config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) + collection = Collection.objects.get(acron3=item.get("acronym")) + config, _ = LogManagerCollectionConfig.objects.get_or_create( + collection=collection + ) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -145,10 +144,10 @@ def load(cls, data, user): cls.create_or_update( user=user, config=config, - directory_name=item.get('directory_name'), - path=item.get('path'), - active=item.get('active', True), - translator_class=item.get('translator_class', 'classic'), + directory_name=item.get("directory_name"), + path=item.get("path"), + active=item.get("active", True), + translator_class=item.get("translator_class", "classic"), ) @classmethod @@ -159,7 +158,7 @@ def create_or_update( directory_name, path, active, - translator_class='classic', + translator_class="classic", ): try: obj = cls.objects.get(config=config, path=path) @@ -168,66 +167,69 @@ def create_or_update( obj.creator = user obj.created = timezone.now() obj.config = config - + obj.updated_by = user obj.updated = timezone.now() obj.directory_name = directory_name obj.path = path obj.active = active - obj.translator_class = translator_class or 'classic' - + obj.translator_class = translator_class or "classic" + obj.save() - logging.info(f'{config.collection.acron3} - {directory_name} - {path}') + 
logging.info(f"{config.collection.acron3} - {directory_name} - {path}") return obj class Meta: - verbose_name = _('Collection Log Directory') - verbose_name_plural = _('Collection Log Directories') + verbose_name = _("Collection Log Directory") + verbose_name_plural = _("Collection Log Directories") constraints = [ - models.UniqueConstraint(fields=['config', 'path'], name='unique_config_path') + models.UniqueConstraint( + fields=["config", "path"], name="unique_config_path" + ) ] - class CollectionEmail(Orderable, CommonControlField): config = ParentalKey( - 'LogManagerCollectionConfig', - related_name='emails', + "LogManagerCollectionConfig", + related_name="emails", on_delete=models.CASCADE, null=True, blank=True, ) name = models.CharField( - verbose_name=_('Name'), - max_length=255, + verbose_name=_("Name"), + max_length=255, blank=True, null=True, ) position = models.CharField( - verbose_name=_('Position'), - max_length=255, + verbose_name=_("Position"), + max_length=255, blank=True, null=True, ) email = models.EmailField( - verbose_name=_('E-mail'), + verbose_name=_("E-mail"), blank=False, null=False, ) active = models.BooleanField( - verbose_name=_('Active'), + verbose_name=_("Active"), default=True, ) def __str__(self): - return f'{self.email} - {self.name}' - + return f"{self.email} - {self.name}" + @classmethod def load(cls, data, user): for item in data: try: - collection = Collection.objects.get(acron3=item.get('acronym')) - config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) + collection = Collection.objects.get(acron3=item.get("acronym")) + config, _ = LogManagerCollectionConfig.objects.get_or_create( + collection=collection + ) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -236,10 +238,10 @@ def load(cls, data, user): cls.create_or_update( user=user, config=config, - email=item.get('e-mail'), - name=item.get('name'), - position=item.get('position'), - 
active=item.get('active', True), + email=item.get("e-mail"), + name=item.get("name"), + position=item.get("position"), + active=item.get("active", True), ) @classmethod @@ -262,19 +264,20 @@ def create_or_update( obj.email = email obj.updated_by = user - obj.updated = timezone.now() + obj.updated = timezone.now() obj.name = name obj.position = position obj.active = active - + obj.save() - logging.info(f'{config.collection.acron3} - {name} - {position} - {email}') + logging.info(f"{config.collection.acron3} - {name} - {position} - {email}") return obj - + class Meta: - verbose_name = _('Collection Email') - verbose_name_plural = _('Collection Emails') + verbose_name = _("Collection Email") + verbose_name_plural = _("Collection Emails") constraints = [ - models.UniqueConstraint(fields=['config', 'email'], name='unique_config_email') + models.UniqueConstraint( + fields=["config", "email"], name="unique_config_email" + ) ] - diff --git a/log_manager_config/tasks.py b/log_manager_config/tasks.py index 415dbf9..6c36df3 100644 --- a/log_manager_config/tasks.py +++ b/log_manager_config/tasks.py @@ -1,23 +1,27 @@ -from django.conf import settings - from config import celery_app -from config.collections import COLLECTION_SIZE_SAMPLE_MAP, LOG_MANAGER_SEED_DATA +from config.collections import ( + COLLECTION_SIZE_SAMPLE_MAP, + LOG_MANAGER_SEED_DATA, + get_collection_size, +) from core.utils.request_utils import _get_user -from . 
import models +from log_manager_config import models -@celery_app.task(bind=True, name='[Log Pipeline] Load Log Manager Settings (Seed)') -def task_load_log_manager_collection_settings(self, data=None, user_id=None, username=None): +@celery_app.task(bind=True, name="[Log Pipeline] Load Log Manager Settings (Seed)") +def task_load_log_manager_collection_settings( + self, data=None, user_id=None, username=None +): user = _get_user(self.request, username=username, user_id=user_id) if not data: data = LOG_MANAGER_SEED_DATA for i in data: - size = getattr(settings, 'COLLECTION_ACRON3_SIZE_MAP', {}).get(i['acronym'], 'small') - i['sample_size'] = COLLECTION_SIZE_SAMPLE_MAP.get(size, 1.0) - i['buffer_size'] = 2048 + size = get_collection_size(i["acronym"]) + i["sample_size"] = COLLECTION_SIZE_SAMPLE_MAP.get(size, 1.0) + i["buffer_size"] = 2048 models.LogManagerCollectionConfig.load(data, user) models.CollectionLogDirectory.load(data, user) diff --git a/log_manager_config/tests.py b/log_manager_config/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/log_manager_config/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. 
diff --git a/log_manager_config/tests/__init__.py b/log_manager_config/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/log_manager_config/tests/test_models.py b/log_manager_config/tests/test_models.py new file mode 100644 index 0000000..6c1dad3 --- /dev/null +++ b/log_manager_config/tests/test_models.py @@ -0,0 +1,84 @@ +from django.test import TestCase + +from collection.models import Collection +from core.users.tests.factories import UserFactory +from log_manager_config.models import CollectionLogDirectory, LogManagerCollectionConfig + + +class LogManagerCollectionConfigTests(TestCase): + def setUp(self): + self.user = UserFactory() + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def test_create_or_update_creates_config(self): + config = LogManagerCollectionConfig.create_or_update( + user=self.user, + collection=self.collection, + sample_size=0.2, + buffer_size=4096, + expected_logs_per_day=3, + ) + + self.assertEqual(config.collection, self.collection) + self.assertEqual(config.sample_size, 0.2) + self.assertEqual(config.buffer_size, 4096) + self.assertEqual(config.expected_logs_per_day, 3) + + def test_create_or_update_updates_existing(self): + LogManagerCollectionConfig.create_or_update( + user=self.user, + collection=self.collection, + sample_size=0.1, + buffer_size=2048, + expected_logs_per_day=1, + ) + config = LogManagerCollectionConfig.create_or_update( + user=self.user, + collection=self.collection, + sample_size=0.5, + buffer_size=8192, + expected_logs_per_day=5, + ) + + self.assertEqual(LogManagerCollectionConfig.objects.count(), 1) + self.assertEqual(config.sample_size, 0.5) + self.assertEqual(config.buffer_size, 8192) + + +class CollectionLogDirectoryTests(TestCase): + def setUp(self): + self.user = UserFactory() + self.collection = Collection.objects.create(acron3="scl", acron2="sc") + self.config = LogManagerCollectionConfig.create_or_update( + user=self.user, + collection=self.collection, + 
sample_size=0.1, + buffer_size=2048, + expected_logs_per_day=1, + ) + + def test_create_or_update_creates_directory(self): + directory = CollectionLogDirectory.create_or_update( + user=self.user, + config=self.config, + directory_name="classic-logs", + path="/data/logs/scl", + active=True, + translator_class="classic", + ) + + self.assertEqual(directory.config, self.config) + self.assertEqual(directory.path, "/data/logs/scl") + self.assertEqual(directory.translator_class, "classic") + + def test_translator_class_defaults_to_classic(self): + directory = CollectionLogDirectory.create_or_update( + user=self.user, + config=self.config, + directory_name="logs", + path="/data/logs/scl", + active=True, + translator_class=None, + ) + + self.assertEqual(directory.translator_class, "classic") diff --git a/log_manager_config/views.py b/log_manager_config/views.py index 91ea44a..60f00ef 100644 --- a/log_manager_config/views.py +++ b/log_manager_config/views.py @@ -1,3 +1 @@ -from django.shortcuts import render - # Create your views here. diff --git a/log_manager_config/wagtail_hooks.py b/log_manager_config/wagtail_hooks.py index f91c0b1..100fda3 100644 --- a/log_manager_config/wagtail_hooks.py +++ b/log_manager_config/wagtail_hooks.py @@ -3,6 +3,7 @@ from log_manager_config.models import LogManagerCollectionConfig + class LogManagerCollectionConfigSnippetViewSet(SnippetViewSet): model = LogManagerCollectionConfig menu_label = _("Log Manager Configurations") @@ -16,9 +17,5 @@ class LogManagerCollectionConfigSnippetViewSet(SnippetViewSet): "expected_logs_per_day", "updated", ) - list_filter = ( - "collection", - ) - search_fields = ( - "collection__acron3", - ) + list_filter = ("collection",) + search_fields = ("collection__acron3",) diff --git a/metrics/admin.py b/metrics/admin.py deleted file mode 100755 index 8c38f3f..0000000 --- a/metrics/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.contrib import admin - -# Register your models here. 
diff --git a/metrics/counter/__init__.py b/metrics/counter/__init__.py index c9afd92..e69de29 100644 --- a/metrics/counter/__init__.py +++ b/metrics/counter/__init__.py @@ -1,22 +0,0 @@ -from .access import ( - extract_item_access_data, - is_valid_item_access_data, - update_results_with_item_access_data, -) -from .documents import convert_raw_results_to_index_documents -from .identifiers import ( - generate_item_access_id, - generate_month_document_id, - generate_user_session_id, - generate_year_document_id, -) -from .parser import ( - extract_date_from_validation_dict, - translator_class_name_to_obj, -) -from metrics.opensearch.names import ( - extract_access_month, - extract_access_year, - generate_month_index_name, - generate_year_index_name, -) diff --git a/metrics/counter/access.py b/metrics/counter/access.py deleted file mode 100644 index 65f9b27..0000000 --- a/metrics/counter/access.py +++ /dev/null @@ -1,533 +0,0 @@ -import re -from urllib.parse import unquote, urlparse - -from scielo_usage_counter.values import ( - CONTENT_TYPE_UNDEFINED, - DEFAULT_SCIELO_ISSN, - MEDIA_LANGUAGE_UNDEFINED, - MEDIA_FORMAT_UNDEFINED, -) - -from core.utils.standardizer import ( - standardize_language_code, - standardize_pid_generic, - standardize_pid_v2, - standardize_pid_v3, - standardize_year_of_publication, -) -from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour -from metrics.counter.identifiers import ( - generate_item_access_id, - generate_user_session_id, -) - - -def extract_item_access_data(collection_acron3: str, translated_url: dict): - if not translated_url or not isinstance(translated_url, dict): - return {} - - source_type = _extract_source_type(collection_acron3, translated_url) - source_id = _extract_source_id(collection_acron3, translated_url, source_type) - scielo_issn = _extract_scielo_issn(translated_url, source_type, source_id) - document_type = _extract_document_type( - collection_acron3, translated_url, source_type - ) - 
publication_year = _safe_standardize( - standardize_year_of_publication, - translated_url.get("year_of_publication"), - ) - source_access_type = translated_url.get("source_access_type") - - return { - "collection": collection_acron3, - "source_type": source_type, - "source_id": source_id, - "scielo_issn": scielo_issn, - "document_type": document_type, - "document_title": _extract_document_title(translated_url, document_type), - "pid_v2": _safe_standardize(standardize_pid_v2, translated_url.get("pid_v2")), - "pid_v3": _safe_standardize(standardize_pid_v3, translated_url.get("pid_v3")), - "pid_generic": _safe_standardize( - standardize_pid_generic, - translated_url.get("pid_generic"), - ), - "title_pid_generic": _safe_standardize( - standardize_pid_generic, - translated_url.get("title_pid_generic"), - ), - "segment_pid_generics": _standardize_pid_generic_list( - translated_url.get("segment_pid_generics"), - ), - "media_language": _safe_standardize( - standardize_language_code, - translated_url.get("media_language"), - default="un", - ), - "media_format": translated_url.get("media_format"), - "content_type": translated_url.get("content_type"), - "access_url": translated_url.get("access_url") - or translated_url.get("normalized_url"), - "publication_year": publication_year, - "counter_access_type": _counter_access_type(source_access_type), - "access_method": "Regular", - "source_main_title": _extract_source_title(translated_url), - "source_subject_area_capes": translated_url.get("source_subject_area_capes") - or translated_url.get("journal_subject_area_capes"), - "source_subject_area_wos": translated_url.get("source_subject_area_wos") - or translated_url.get("journal_subject_area_wos"), - "source_acronym": translated_url.get("source_acronym") - or translated_url.get("journal_acronym"), - "source_publisher_name": translated_url.get("source_publisher_name") - or translated_url.get("journal_publisher_name"), - "source_access_type": source_access_type, - 
"source_identifiers": _extract_source_identifiers( - translated_url, source_id, source_type - ), - "source_city": translated_url.get("source_city"), - "source_country": translated_url.get("source_country"), - } - - -def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False): - if not isinstance(data, dict): - return False, { - "message": "Invalid data format. Expected a dictionary.", - "code": "invalid_format", - } - - scielo_issn = data.get("scielo_issn") - source_id = data.get("source_id") - source_type = data.get("source_type") - document_type = data.get("document_type") or "article" - media_format = data.get("media_format") - media_language = data.get("media_language") - content_type = data.get("content_type") - pid_v2 = data.get("pid_v2") - pid_v3 = data.get("pid_v3") - pid_generic = data.get("pid_generic") - has_source_identity = bool(source_id) or bool( - scielo_issn and scielo_issn != DEFAULT_SCIELO_ISSN - ) - has_media_language = bool( - media_language and media_language != MEDIA_LANGUAGE_UNDEFINED - ) - has_pid = bool(pid_v2 or pid_v3 or pid_generic) - - if not all( - [ - media_format and media_format != MEDIA_FORMAT_UNDEFINED, - content_type and content_type != CONTENT_TYPE_UNDEFINED, - has_pid, - ] - ): - return False, { - "message": "Missing required fields in item access data.", - "code": "missing_fields", - } - - if document_type in {"article", "book", "chapter"} and not has_media_language: - return False, { - "message": "Missing media language in item access data.", - "code": "missing_fields", - } - - if document_type == "article" and not has_source_identity: - return False, { - "message": "Missing article source identity.", - "code": "missing_fields", - } - - if document_type in {"book", "chapter"} and not source_id: - return False, { - "message": "Missing book source identity.", - "code": "missing_fields", - } - - if document_type in {"preprint", "dataset"} and not pid_generic: - return False, { - "message": "Missing generic 
PID in item access data.", - "code": "missing_fields", - } - - if utm and not ignore_utm_validation: - if ( - source_type == "journal" - and scielo_issn - and scielo_issn != DEFAULT_SCIELO_ISSN - and not utm.is_valid_code(scielo_issn, utm.sources_metadata["issn_set"]) - ): - return False, { - "message": f"Invalid scielo_issn: {scielo_issn}", - "code": "invalid_scielo_issn", - } - - if ( - source_type - and source_type != "journal" - and source_id - and source_id not in utm.sources_metadata.get("source_id_to_type", {}) - ): - return False, { - "message": f"Invalid source_id: {source_id}", - "code": "invalid_source_id", - } - - if pid_v2 and not utm.is_valid_code(pid_v2, utm.documents_metadata["pid_set"]): - return False, { - "message": f"Invalid pid_v2: {pid_v2}", - "code": "invalid_pid_v2", - } - - if pid_v3 and not utm.is_valid_code(pid_v3, utm.documents_metadata["pid_set"]): - return False, { - "message": f"Invalid pid_v3: {pid_v3}", - "code": "invalid_pid_v3", - } - - if pid_generic and not utm.is_valid_code( - pid_generic, utm.documents_metadata["pid_set"] - ): - return False, { - "message": f"Invalid pid_generic: {pid_generic}", - "code": "invalid_pid_generic", - } - - return True, {"message": "Item access data is valid.", "code": "valid"} - - -def update_results_with_item_access_data( - results: dict, item_access_data: dict, line: dict -): - col_acron3 = item_access_data.get("collection") - source_key = ( - item_access_data.get("source_id") - or item_access_data.get("scielo_issn") - or item_access_data.get("source_type") - or col_acron3 - ) - pid_v2 = item_access_data.get("pid_v2") - pid_v3 = item_access_data.get("pid_v3") - media_format = item_access_data.get("media_format") - content_language = item_access_data.get("media_language") - content_type = item_access_data.get("content_type") - access_url = item_access_data.get("access_url") or _normalize_access_url( - line.get("url") - ) - - client_name = line.get("client_name") - client_version = 
line.get("client_version") - local_datetime = line.get("local_datetime") - access_country_code = line.get("country_code") - ip_address = line.get("ip_address") - - truncated_datetime = truncate_datetime_to_hour(local_datetime) - ms_key = extract_minute_second_key(local_datetime) - if truncated_datetime is None or ms_key is None: - raise ValueError("Invalid local_datetime in parsed log line.") - - access_date = truncated_datetime.strftime("%Y-%m-%d") - access_year = access_date[:4] - access_month = access_date[:7].replace("-", "") - - user_session_id = generate_user_session_id( - client_name, - client_version, - ip_address, - truncated_datetime, - ) - - for access_target in _iter_access_targets(item_access_data): - item_access_id = generate_item_access_id( - user_session_id=user_session_id, - col_acron3=col_acron3, - source_key=source_key, - pid_v2=pid_v2, - pid_v3=pid_v3, - pid_generic=access_target.get("pid_generic"), - content_language=content_language, - access_country_code=access_country_code, - media_format=media_format, - content_type=content_type, - ) - - if item_access_id not in results: - results[item_access_id] = { - "collection": col_acron3, - "source_key": source_key, - "document_type": access_target.get("document_type"), - "pid_v2": pid_v2, - "pid_v3": pid_v3, - "pid_generic": access_target.get("pid_generic"), - "document": _build_document(item_access_data), - "title_pid_generic": ( - item_access_data.get("title_pid_generic") - or access_target.get("pid_generic") - ), - "user_session_id": user_session_id, - "click_timestamps": {ms_key: 0}, - "click_timestamps_by_url": {}, - "access_url": access_url, - "media_format": media_format, - "content_language": content_language, - "content_type": content_type, - "access_country_code": access_country_code, - "access_date": access_date, - "access_year": access_year, - "access_month": access_month, - "publication_year": item_access_data.get("publication_year"), - "counter_access_type": 
item_access_data.get("counter_access_type") - or "Open", - "access_method": item_access_data.get("access_method") or "Regular", - "source": { - "source_type": item_access_data.get("source_type"), - "source_id": item_access_data.get("source_id"), - "scielo_issn": item_access_data.get("scielo_issn"), - "main_title": item_access_data.get("source_main_title"), - "identifiers": item_access_data.get("source_identifiers"), - "access_type": item_access_data.get("source_access_type"), - "city": item_access_data.get("source_city"), - "country": item_access_data.get("source_country"), - "subject_area_capes": item_access_data.get( - "source_subject_area_capes" - ), - "subject_area_wos": item_access_data.get("source_subject_area_wos"), - "acronym": item_access_data.get("source_acronym"), - "publisher_name": item_access_data.get("source_publisher_name"), - }, - } - - if ms_key not in results[item_access_id]["click_timestamps"]: - results[item_access_id]["click_timestamps"][ms_key] = 0 - - results[item_access_id]["click_timestamps"][ms_key] += 1 - - access_url_key = access_url or _fallback_access_url_key( - access_target.get("pid_generic"), - media_format, - content_type, - ) - timestamps_by_url = results[item_access_id].setdefault( - "click_timestamps_by_url", {} - ) - url_timestamps = timestamps_by_url.setdefault(access_url_key, {}) - if ms_key not in url_timestamps: - url_timestamps[ms_key] = 0 - url_timestamps[ms_key] += 1 - - -def _extract_source_type(collection_acron3, translated_url): - source_type = translated_url.get("source_type") - if source_type: - return source_type - - if collection_acron3 == "preprints": - return "preprint_server" - - if collection_acron3 == "data": - return "data_repository" - - if collection_acron3 == "books": - return "book" - - if translated_url.get("book_id"): - return "book" - - if ( - translated_url.get("scielo_issn") - and translated_url.get("scielo_issn") != DEFAULT_SCIELO_ISSN - ): - return "journal" - - if 
translated_url.get("journal_acronym") or translated_url.get( - "journal_main_title" - ): - return "journal" - - return "other" - - -def _extract_source_id(collection_acron3, translated_url, source_type): - source_id = translated_url.get("source_id") - if source_id: - return source_id - - if source_type == "preprint_server": - return translated_url.get("preprint_server_id") or "scielo-preprints" - - if source_type == "data_repository": - return translated_url.get("repository_id") or "scielo-data" - - if source_type == "book": - return ( - translated_url.get("book_id") - or _extract_book_id_from_pid(translated_url.get("title_pid_generic")) - or _extract_book_id_from_pid(translated_url.get("pid_generic")) - ) - - if source_type == "journal": - return translated_url.get("scielo_issn") - - return None - - -def _extract_scielo_issn(translated_url, source_type, source_id): - scielo_issn = translated_url.get("scielo_issn") - if scielo_issn: - return scielo_issn - - if source_type == "journal" and source_id: - return source_id - - if source_type in {"book", "other"} or translated_url.get("book_id"): - return DEFAULT_SCIELO_ISSN - - return None - - -def _extract_source_title(translated_url): - return ( - translated_url.get("source_main_title") - or translated_url.get("journal_main_title") - or translated_url.get("book_title") - ) - - -def _extract_document_title(translated_url, document_type): - if document_type == "chapter": - return translated_url.get("chapter_title") - if document_type == "book": - return translated_url.get("book_title") - return ( - translated_url.get("document_title") - or translated_url.get("article_title") - or translated_url.get("title") - ) - - -def _extract_document_type(collection_acron3, translated_url, source_type): - document_type = translated_url.get("document_type") - if document_type: - return document_type - - if collection_acron3 == "preprints": - return "preprint" - - if collection_acron3 == "data": - return "dataset" - - if 
collection_acron3 == "books" or source_type == "book": - pid_generic = translated_url.get("pid_generic") or "" - if translated_url.get("chapter_id") or "/CHAPTER:" in pid_generic.upper(): - return "chapter" - if translated_url.get("book_id"): - return "book" - return "book" - - if source_type == "journal": - return "article" - - return "article" - - -def _extract_source_identifiers(translated_url, source_id, source_type): - identifiers = translated_url.get("source_identifiers") - if isinstance(identifiers, dict): - compact = { - key: value - for key, value in identifiers.items() - if value not in (None, "", [], {}, ()) - } - if compact: - return compact - - if source_type != "book": - return None - - compact = { - "book_id": source_id or translated_url.get("book_id"), - "isbn": translated_url.get("isbn"), - "eisbn": translated_url.get("eisbn"), - "doi": translated_url.get("doi"), - } - compact = { - key: value - for key, value in compact.items() - if value not in (None, "", [], {}, ()) - } - return compact or None - - -def _extract_book_id_from_pid(value): - if not value: - return None - normalized = str(value).upper() - if not normalized.startswith("BOOK:"): - return None - return normalized.split("BOOK:", 1)[1].split("/", 1)[0] or None - - -def _counter_access_type(source_access_type): - normalized = str(source_access_type or "").strip().lower() - if normalized == "commercial": - return "Controlled" - if normalized in {"free_to_read", "free-to-read", "free"}: - return "Free_To_Read" - return "Open" - - -def _safe_standardize(func, value, default=""): - try: - return func(value) - except Exception: - return default - - -def _standardize_pid_generic_list(values): - if not isinstance(values, (list, tuple, set)): - return [] - items = [] - for value in values: - item = _safe_standardize(standardize_pid_generic, value) - if item and item not in items: - items.append(item) - return items - - -def _build_document(item_access_data): - title = 
item_access_data.get("document_title") - if not title: - return {} - return {"title": title} - - -def _iter_access_targets(item_access_data): - return [ - { - "pid_generic": item_access_data.get("pid_generic"), - "document_type": item_access_data.get("document_type"), - } - ] - - -def _normalize_access_url(url): - if not url: - return None - parsed_url = urlparse(str(url).strip()) - path = ( - parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip() - ) - path = unquote(path or "") - path = path.split("?", 1)[0].split("#", 1)[0].split()[0] - path = re.sub(r"/+", "/", path) - path = path.rstrip(".,;:") - return path or None - - -def _fallback_access_url_key(pid_generic, media_format, content_type): - return "|".join( - [ - str(pid_generic or ""), - str(media_format or ""), - str(content_type or ""), - ] - ) diff --git a/metrics/counter/access/__init__.py b/metrics/counter/access/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/counter/access/accumulation.py b/metrics/counter/access/accumulation.py new file mode 100644 index 0000000..bed2407 --- /dev/null +++ b/metrics/counter/access/accumulation.py @@ -0,0 +1,206 @@ +import re +from urllib.parse import unquote, urlparse + +from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour + + +def accumulate(results, counter_access, line): + access_url = counter_access.get("access_url") or _normalized_access_path( + line.get("url") + ) + counter_access = {**counter_access, "access_url": access_url} + + client_name = line.get("client_name") + client_version = line.get("client_version") + local_datetime = line.get("local_datetime") + ip_address = line.get("ip_address") + + access_datetime = truncate_datetime_to_hour(local_datetime) + ms_key = extract_minute_second_key(local_datetime) + if access_datetime is None or ms_key is None: + raise ValueError("Invalid local_datetime in parsed log line.") + + user_session_id = _generate_user_session_id( + 
client_name, + client_version, + ip_address, + access_datetime, + ) + raw_record = _build_record( + counter_access=counter_access, + line=line, + access_datetime=access_datetime, + minute_second_key=ms_key, + user_session_id=user_session_id, + ) + item_access_id = raw_record["id"] + + if item_access_id not in results: + results[item_access_id] = raw_record["data"] + + _increment_timestamp_count(results[item_access_id]["click_timestamps"], ms_key) + + access_url_key = access_url or "|".join( + [ + str(counter_access.get("pid_generic") or ""), + str(counter_access.get("media_format") or ""), + str(counter_access.get("content_type") or ""), + ] + ) + timestamps_by_url = results[item_access_id].setdefault( + "click_timestamps_by_url", {} + ) + url_timestamps = timestamps_by_url.setdefault(access_url_key, {}) + _increment_timestamp_count(url_timestamps, ms_key) + + +def _build_record( + counter_access, line, access_datetime, minute_second_key, user_session_id +): + collection = counter_access.get("collection") + source_key = _source_key(counter_access, collection) + pid_v2 = counter_access.get("pid_v2") + pid_v3 = counter_access.get("pid_v3") + pid_generic = counter_access.get("pid_generic") + media_format = counter_access.get("media_format") + content_language = counter_access.get("media_language") + content_type = counter_access.get("content_type") + access_country_code = line.get("country_code") + access_date = access_datetime.strftime("%Y-%m-%d") + + return { + "id": _generate_item_access_id( + user_session_id=user_session_id, + col_acron3=collection, + source_key=source_key, + pid_v2=pid_v2, + pid_v3=pid_v3, + pid_generic=pid_generic, + content_language=content_language, + access_country_code=access_country_code, + media_format=media_format, + content_type=content_type, + ), + "data": { + "collection": collection, + "source_key": source_key, + "document_type": counter_access.get("document_type"), + "pid_v2": pid_v2, + "pid_v3": pid_v3, + "pid_generic": pid_generic, 
+ "document": _document_metadata(counter_access), + "title_pid_generic": counter_access.get("title_pid_generic") or pid_generic, + "user_session_id": user_session_id, + "click_timestamps": {minute_second_key: 0}, + "click_timestamps_by_url": {}, + "access_url": counter_access.get("access_url"), + "media_format": media_format, + "content_language": content_language, + "content_type": content_type, + "access_country_code": access_country_code, + "access_date": access_date, + "access_year": access_date[:4], + "access_month": access_date[:7].replace("-", ""), + "publication_year": counter_access.get("publication_year"), + "counter_access_type": counter_access.get("counter_access_type") or "Open", + "access_method": counter_access.get("access_method") or "Regular", + "source": _source_metadata(counter_access), + }, + } + + +def _increment_timestamp_count(timestamps, key): + if key not in timestamps: + timestamps[key] = 0 + timestamps[key] += 1 + + +def _normalized_access_path(url): + if not url: + return None + parsed_url = urlparse(str(url).strip()) + path = ( + parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip() + ) + path = unquote(path or "") + path = path.split("?", 1)[0].split("#", 1)[0].split()[0] + path = re.sub(r"/+", "/", path) + path = path.rstrip(".,;:") + return path or None + + +def _generate_user_session_id( + client_name, client_version, ip_address, datetime, sep="|" +): + dt_year_month_day = datetime.strftime("%Y-%m-%d") + dt_hour = datetime.strftime("%H") + + return sep.join( + [ + str(client_name), + str(client_version), + str(ip_address), + str(dt_year_month_day), + str(dt_hour), + ] + ) + + +def _document_metadata(counter_access): + document_title = counter_access.get("document_title") + return {"title": document_title} if document_title else {} + + +def _source_metadata(counter_access): + return { + "source_type": counter_access.get("source_type"), + "source_id": counter_access.get("source_id"), + "scielo_issn": 
counter_access.get("scielo_issn"), + "main_title": counter_access.get("source_main_title"), + "identifiers": counter_access.get("source_identifiers"), + "access_type": counter_access.get("source_access_type"), + "city": counter_access.get("source_city"), + "country": counter_access.get("source_country"), + "subject_area_capes": counter_access.get("source_subject_area_capes"), + "subject_area_wos": counter_access.get("source_subject_area_wos"), + "acronym": counter_access.get("source_acronym"), + "publisher_name": counter_access.get("source_publisher_name"), + } + + +def _source_key(counter_access, fallback): + return ( + counter_access.get("source_id") + or counter_access.get("scielo_issn") + or counter_access.get("source_type") + or fallback + ) + + +def _generate_item_access_id( + col_acron3, + source_key, + pid_v2, + pid_v3, + pid_generic, + user_session_id, + access_country_code, + content_language, + media_format, + content_type, + sep="|", +): + return sep.join( + [ + col_acron3, + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + str(user_session_id or ""), + str(access_country_code or ""), + str(content_language or ""), + str(media_format or ""), + str(content_type or ""), + ] + ) diff --git a/metrics/counter/access/extraction.py b/metrics/counter/access/extraction.py new file mode 100644 index 0000000..54ac429 --- /dev/null +++ b/metrics/counter/access/extraction.py @@ -0,0 +1,199 @@ +from scielo_usage_counter.values import DEFAULT_SCIELO_ISSN + +from core.utils.standardizer import ( + standardize_language_code, + standardize_or_default, + standardize_pid_generic, + standardize_pid_generic_values, + standardize_pid_v2, + standardize_pid_v3, + standardize_year_of_publication, +) + + +def extract(collection_acron3, translated_url): + if not translated_url or not isinstance(translated_url, dict): + return {} + + source_type = _resolve_source_type(collection_acron3, translated_url) + source_id = _resolve_source_id(translated_url, 
source_type) + scielo_issn = _resolve_scielo_issn(translated_url, source_type, source_id) + document_type = _resolve_document_type( + collection_acron3, translated_url, source_type + ) + publication_year = standardize_or_default( + standardize_year_of_publication, + translated_url.get("year_of_publication"), + ) + source_access_type = translated_url.get("source_access_type") + + return { + "collection": collection_acron3, + "source_type": source_type, + "source_id": source_id, + "scielo_issn": scielo_issn, + "document_type": document_type, + "document_title": _resolve_document_title(document_type, translated_url), + "pid_v2": standardize_or_default( + standardize_pid_v2, + translated_url.get("pid_v2"), + ), + "pid_v3": standardize_or_default( + standardize_pid_v3, + translated_url.get("pid_v3"), + ), + "pid_generic": standardize_or_default( + standardize_pid_generic, + translated_url.get("pid_generic"), + ), + "title_pid_generic": standardize_or_default( + standardize_pid_generic, + translated_url.get("title_pid_generic"), + ), + "segment_pid_generics": standardize_pid_generic_values( + translated_url.get("segment_pid_generics"), + ), + "media_language": standardize_or_default( + standardize_language_code, + translated_url.get("media_language"), + default="un", + ), + "media_format": translated_url.get("media_format"), + "content_type": translated_url.get("content_type"), + "access_url": translated_url.get("access_url") + or translated_url.get("normalized_url"), + "publication_year": publication_year, + "counter_access_type": _resolve_counter_access_type(source_access_type), + "access_method": "Regular", + "source_main_title": ( + translated_url.get("source_main_title") + or translated_url.get("journal_main_title") + or translated_url.get("book_title") + ), + "source_subject_area_capes": translated_url.get("source_subject_area_capes") + or translated_url.get("journal_subject_area_capes"), + "source_subject_area_wos": translated_url.get("source_subject_area_wos") + 
or translated_url.get("journal_subject_area_wos"), + "source_acronym": translated_url.get("source_acronym") + or translated_url.get("journal_acronym"), + "source_publisher_name": translated_url.get("source_publisher_name") + or translated_url.get("journal_publisher_name"), + "source_access_type": source_access_type, + "source_identifiers": _resolve_source_identifiers(translated_url), + "source_city": translated_url.get("source_city"), + "source_country": translated_url.get("source_country"), + } + + +def _resolve_document_title(document_type, translated_url): + if document_type == "chapter": + return translated_url.get("chapter_title") + + if document_type == "book": + return translated_url.get("book_title") + + return ( + translated_url.get("document_title") + or translated_url.get("article_title") + or translated_url.get("title") + ) + + +def _resolve_counter_access_type(source_access_type): + normalized_access_type = str(source_access_type or "").strip().lower() + if normalized_access_type == "commercial": + return "Controlled" + + if normalized_access_type in {"free_to_read", "free-to-read", "free"}: + return "Free_To_Read" + + return "Open" + + +def _resolve_source_type(collection_acron3, translated_url): + source_type = translated_url.get("source_type") + if source_type: + return source_type + + if collection_acron3 == "preprints": + return "preprint_server" + + if collection_acron3 == "data": + return "data_repository" + + if ( + translated_url.get("scielo_issn") + and translated_url.get("scielo_issn") != DEFAULT_SCIELO_ISSN + ): + return "journal" + + if translated_url.get("journal_acronym") or translated_url.get( + "journal_main_title" + ): + return "journal" + + return "other" + + +def _resolve_source_id(translated_url, source_type): + source_id = translated_url.get("source_id") + if source_id: + return source_id + + if source_type == "preprint_server": + return translated_url.get("preprint_server_id") or "scielo-preprints" + + if source_type == 
"data_repository": + return translated_url.get("repository_id") or "scielo-data" + + if source_type == "journal": + return translated_url.get("scielo_issn") + + return None + + +def _resolve_scielo_issn(translated_url, source_type, source_id): + scielo_issn = translated_url.get("scielo_issn") + if scielo_issn: + return scielo_issn + + if source_type == "journal" and source_id: + return source_id + + if source_type in {"book", "other"}: + return DEFAULT_SCIELO_ISSN + + return None + + +def _resolve_document_type(collection_acron3, translated_url, source_type): + document_type = translated_url.get("document_type") + if document_type: + return document_type + + if collection_acron3 == "preprints": + return "preprint" + + if collection_acron3 == "data": + return "dataset" + + if source_type == "journal": + return "article" + + return "article" + + +def _resolve_source_identifiers(translated_url): + identifiers = translated_url.get("source_identifiers") + if isinstance(identifiers, dict): + return _compact_identifiers(identifiers) + return None + + +def _compact_identifiers(identifiers): + compact = { + key: value + for key, value in identifiers.items() + if value not in (None, "", [], {}, ()) + } + return compact or None diff --git a/metrics/counter/access/validation.py b/metrics/counter/access/validation.py new file mode 100644 index 0000000..673b6c1 --- /dev/null +++ b/metrics/counter/access/validation.py @@ -0,0 +1,113 @@ +from scielo_usage_counter.values import ( + CONTENT_TYPE_UNDEFINED, + DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_UNDEFINED, + MEDIA_LANGUAGE_UNDEFINED, +) + + +def is_valid(data, utm=None, ignore_utm_validation=False): + if not isinstance(data, dict): + return False, { + "message": "Invalid data format. 
Expected a dictionary.", + "code": "invalid_format", + } + + scielo_issn = data.get("scielo_issn") + source_id = data.get("source_id") + source_type = data.get("source_type") + document_type = data.get("document_type") or "article" + media_format = data.get("media_format") + media_language = data.get("media_language") + content_type = data.get("content_type") + pid_v2 = data.get("pid_v2") + pid_v3 = data.get("pid_v3") + pid_generic = data.get("pid_generic") + has_source_identity = bool(source_id) or bool( + scielo_issn and scielo_issn != DEFAULT_SCIELO_ISSN + ) + has_media_language = bool( + media_language and media_language != MEDIA_LANGUAGE_UNDEFINED + ) + has_pid = bool(pid_v2 or pid_v3 or pid_generic) + + if not all( + [ + media_format and media_format != MEDIA_FORMAT_UNDEFINED, + content_type and content_type != CONTENT_TYPE_UNDEFINED, + has_pid, + ] + ): + return False, { + "message": "Missing required fields in item access data.", + "code": "missing_fields", + } + + if document_type in {"article", "book", "chapter"} and not has_media_language: + return False, { + "message": "Missing media language in item access data.", + "code": "missing_fields", + } + + if document_type == "article" and not has_source_identity: + return False, { + "message": "Missing article source identity.", + "code": "missing_fields", + } + + if document_type in {"book", "chapter"} and not source_id: + return False, { + "message": "Missing book source identity.", + "code": "missing_fields", + } + + if document_type in {"preprint", "dataset"} and not pid_generic: + return False, { + "message": "Missing generic PID in item access data.", + "code": "missing_fields", + } + + if utm and not ignore_utm_validation: + if ( + source_type == "journal" + and scielo_issn + and scielo_issn != DEFAULT_SCIELO_ISSN + and not utm.is_valid_code(scielo_issn, utm.sources_metadata["issn_set"]) + ): + return False, { + "message": f"Invalid scielo_issn: {scielo_issn}", + "code": "invalid_scielo_issn", + } + + 
if ( + source_type + and source_type != "journal" + and source_id + and source_id not in utm.sources_metadata.get("source_id_to_type", {}) + ): + return False, { + "message": f"Invalid source_id: {source_id}", + "code": "invalid_source_id", + } + + if pid_v2 and not utm.is_valid_code(pid_v2, utm.documents_metadata["pid_set"]): + return False, { + "message": f"Invalid pid_v2: {pid_v2}", + "code": "invalid_pid_v2", + } + + if pid_v3 and not utm.is_valid_code(pid_v3, utm.documents_metadata["pid_set"]): + return False, { + "message": f"Invalid pid_v3: {pid_v3}", + "code": "invalid_pid_v3", + } + + if pid_generic and not utm.is_valid_code( + pid_generic, utm.documents_metadata["pid_set"] + ): + return False, { + "message": f"Invalid pid_generic: {pid_generic}", + "code": "invalid_pid_generic", + } + + return True, {"message": "Item access data is valid.", "code": "valid"} diff --git a/metrics/counter/aggregation.py b/metrics/counter/aggregation.py deleted file mode 100644 index d047e7a..0000000 --- a/metrics/counter/aggregation.py +++ /dev/null @@ -1,124 +0,0 @@ -from scielo_usage_counter.counter import get_valid_clicks, is_request - - -def apply_unique_metrics( - document, - unique_state, - scope, - document_id, - user_session_id, - is_request_event, -): - if not user_session_id: - return - - inv_bucket = unique_state[f"{scope}_investigations"] - inv_key = (document_id, user_session_id) - add_investigation = inv_key not in inv_bucket - if add_investigation: - inv_bucket.add(inv_key) - - add_request = False - if is_request_event: - req_bucket = unique_state[f"{scope}_requests"] - req_key = (document_id, user_session_id) - add_request = req_key not in req_bucket - if add_request: - req_bucket.add(req_key) - - increment_document_uniques( - document=document, - add_investigation=add_investigation, - add_request=add_request, - ) - - -def increment_document_totals(document, click_timestamps, content_type, click_timestamps_by_url=None): - number_of_clicks = 
_count_valid_clicks( - click_timestamps=click_timestamps, - click_timestamps_by_url=click_timestamps_by_url, - ) - - document["total_investigations"] += number_of_clicks - if is_request(content_type): - document["total_requests"] += number_of_clicks - - if "daily_metrics" in document: - day_key = list(document["daily_metrics"].keys())[0] - document["daily_metrics"][day_key]["total_investigations"] += number_of_clicks - if is_request(content_type): - document["daily_metrics"][day_key]["total_requests"] += number_of_clicks - - -def _count_valid_clicks(click_timestamps, click_timestamps_by_url=None): - if isinstance(click_timestamps_by_url, dict) and click_timestamps_by_url: - return sum( - get_valid_clicks(timestamps or {}) - for timestamps in click_timestamps_by_url.values() - ) - return get_valid_clicks(click_timestamps or {}) - - -def increment_document_uniques(document, add_investigation=False, add_request=False): - if add_investigation: - document["unique_investigations"] += 1 - if add_request: - document["unique_requests"] += 1 - - if "daily_metrics" in document: - day_key = list(document["daily_metrics"].keys())[0] - if add_investigation: - document["daily_metrics"][day_key]["unique_investigations"] += 1 - if add_request: - document["daily_metrics"][day_key]["unique_requests"] += 1 - - -def counter_data_type(document_type): - if document_type == "dataset": - return "Dataset" - if document_type in {"article", "preprint"}: - return "Article" - if document_type == "book": - return "Book" - if document_type == "chapter": - return "Book_Segment" - return "Other" - - -def parent_data_type(document_type, source_type=None): - if document_type == "chapter": - return "Book" - if document_type == "article" and source_type == "journal": - return "Journal" - return None - - -def article_version(document_type): - if document_type == "preprint": - return "Preprint" - return None - - -def should_create_book_item_document(value): - if not value.get("pid_generic"): - return 
False - if value.get("document_type") == "book" and not is_request(value.get("content_type")): - return False - return True - - -def extract_title_pid_generic(value, fallback=None): - title_pid_generic = value.get("title_pid_generic") - if title_pid_generic: - return title_pid_generic - - pid_generic = value.get("pid_generic") - if "/CHAPTER:" in (pid_generic or "").upper(): - return pid_generic.upper().split("/CHAPTER:")[0] - - source = value.get("source") or {} - source_id = source.get("source_id") - if source_id: - return f"BOOK:{str(source_id).upper()}" - - return fallback diff --git a/metrics/counter/documents.py b/metrics/counter/documents.py deleted file mode 100644 index e13c0cf..0000000 --- a/metrics/counter/documents.py +++ /dev/null @@ -1,426 +0,0 @@ -from scielo_usage_counter.counter import is_request - -from metrics.counter.aggregation import ( - apply_unique_metrics, - article_version, - counter_data_type, - extract_title_pid_generic, - increment_document_totals, - parent_data_type, - should_create_book_item_document, -) -from metrics.counter.identifiers import ( - generate_month_document_id, - generate_year_document_id, -) - - -def convert_to_month_index_documents(data: dict): - if not isinstance(data, dict): - return {} - - metrics_data = {} - unique_state = _initialize_unique_state() - - for value in data.values(): - _accumulate_documents( - data=metrics_data, - unique_state=unique_state, - value=value, - granularity="month", - ) - - return metrics_data - - -def convert_to_year_index_documents(data: dict): - if not isinstance(data, dict): - return {} - - metrics_data = {} - unique_state = _initialize_unique_state() - - for value in data.values(): - _accumulate_documents( - data=metrics_data, - unique_state=unique_state, - value=value, - granularity="year", - ) - - return metrics_data - - -def convert_raw_results_to_index_documents(data: dict): - return { - "month": convert_to_month_index_documents(data), - "year": 
convert_to_year_index_documents(data), - } - - -def _initialize_unique_state(): - return { - "item_investigations": set(), - "item_requests": set(), - "title_investigations": set(), - "title_requests": set(), - } - - -def _accumulate_documents(data, unique_state, value, granularity): - if not isinstance(value, dict): - return - - if value.get("collection") == "books": - _accumulate_books_documents(data, unique_state, value, granularity) - return - - _accumulate_standard_documents(data, unique_state, value, granularity) - - -def _accumulate_standard_documents(data, unique_state, value, granularity): - document_id = _generate_document_id(value, granularity) - document = data.setdefault( - document_id, - _build_base_document(value=value, granularity=granularity), - ) - - increment_document_totals( - document=document, - click_timestamps=value.get("click_timestamps"), - click_timestamps_by_url=value.get("click_timestamps_by_url"), - content_type=value.get("content_type"), - ) - apply_unique_metrics( - document=document, - unique_state=unique_state, - scope="item", - document_id=document_id, - user_session_id=value.get("user_session_id"), - is_request_event=is_request(value.get("content_type")), - ) - - -def _accumulate_books_documents(data, unique_state, value, granularity): - if should_create_book_item_document(value): - item_document_id = _generate_document_id( - value, - granularity, - metric_scope="item", - ) - item_document = data.setdefault( - item_document_id, - _build_base_document( - value=value, - granularity=granularity, - metric_scope="item", - ), - ) - increment_document_totals( - document=item_document, - click_timestamps=value.get("click_timestamps"), - click_timestamps_by_url=value.get("click_timestamps_by_url"), - content_type=value.get("content_type"), - ) - apply_unique_metrics( - document=item_document, - unique_state=unique_state, - scope="item", - document_id=item_document_id, - user_session_id=value.get("user_session_id"), - 
is_request_event=is_request(value.get("content_type")), - ) - - title_pid_generic = extract_title_pid_generic(value) - if not title_pid_generic: - return - - title_document_id = _generate_document_id( - value, - granularity, - metric_scope="title", - pid_generic=title_pid_generic, - ) - title_document = data.setdefault( - title_document_id, - _build_base_document( - value=value, - granularity=granularity, - metric_scope="title", - pid_generic=title_pid_generic, - document_type="book", - ), - ) - increment_document_totals( - document=title_document, - click_timestamps=value.get("click_timestamps"), - click_timestamps_by_url=value.get("click_timestamps_by_url"), - content_type=value.get("content_type"), - ) - apply_unique_metrics( - document=title_document, - unique_state=unique_state, - scope="title", - document_id=title_document_id, - user_session_id=value.get("user_session_id"), - is_request_event=is_request(value.get("content_type")), - ) - - -def _generate_document_id(value, granularity, metric_scope=None, pid_generic=None): - pid_generic = pid_generic or value.get("pid_generic") - publication_year = str(value.get("publication_year") or "0001") - if granularity == "month": - access_month = ( - value.get("access_date", "")[:7] if value.get("access_date") else "" - ) - return generate_month_document_id( - collection=value.get("collection"), - source_key=value.get("source_key"), - pid_v2=value.get("pid_v2"), - pid_v3=value.get("pid_v3"), - pid_generic=pid_generic, - access_month=access_month, - counter_access_type=value.get("counter_access_type") or "Open", - access_method=value.get("access_method") or "Regular", - publication_year=publication_year, - metric_scope="title" if metric_scope == "title" else None, - ) - - return generate_year_document_id( - collection=value.get("collection"), - source_key=value.get("source_key"), - pid_v2=value.get("pid_v2"), - pid_v3=value.get("pid_v3"), - pid_generic=pid_generic, - content_language=value.get("content_language"), - 
access_country_code=value.get("access_country_code"), - access_year=value.get("access_year"), - counter_access_type=value.get("counter_access_type") or "Open", - access_method=value.get("access_method") or "Regular", - publication_year=publication_year, - metric_scope="title" if metric_scope == "title" else None, - ) - - -def _build_base_document( - value, granularity, metric_scope=None, pid_generic=None, document_type=None -): - collection = value.get("collection") - scope = metric_scope or "item" - if collection == "books": - document_id = pid_generic or value.get("pid_generic") - parent_id = extract_title_pid_generic(value, fallback=document_id) - if parent_id == document_id or scope == "title": - parent_id = None - raw_source = value.get("source") or {} - source = _build_source(raw_source) - base_document = { - "collection": collection, - "source": source, - "document": _build_document( - value=value, - document_id=document_id, - document_type=document_type or value.get("document_type"), - parent_id=parent_id, - source_identifiers=raw_source.get("identifiers"), - metric_scope=scope, - ), - "counter": _compact_dict( - { - "metric_scope": scope, - "data_type": "Book" if scope == "title" else "Book_Segment", - "parent_data_type": "Book" if scope != "title" else None, - "access_type": value.get("counter_access_type") or "Open", - "access_method": value.get("access_method") or "Regular", - } - ), - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } - base_document["access"] = _build_access(value, granularity) - if granularity == "month": - base_document["daily_metrics"] = _build_daily_metrics(value) - return base_document - - document_type = value.get("document_type") - document_id = value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic") - base_document = { - "collection": collection, - "source": _build_source(value.get("source")), - "document": _build_document( - value=value, - 
document_id=document_id, - document_type=document_type, - ), - "counter": _compact_dict( - { - "metric_scope": "item", - "data_type": counter_data_type(document_type), - "parent_data_type": parent_data_type( - document_type, - (value.get("source") or {}).get("source_type"), - ), - "article_version": article_version(document_type), - "access_type": value.get("counter_access_type") or "Open", - "access_method": value.get("access_method") or "Regular", - } - ), - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } - base_document["access"] = _build_access(value, granularity) - if granularity == "month": - base_document["daily_metrics"] = _build_daily_metrics(value) - return base_document - - -def _build_access(value, granularity): - if granularity == "month": - return { - "month": value.get("access_date", "")[:7] - if value.get("access_date") - else "" - } - - return _compact_dict( - { - "year": value.get("access_year"), - "country_code": value.get("access_country_code"), - "content_language": value.get("content_language"), - } - ) - - -def _build_daily_metrics(value): - day = value.get("access_date", "")[-2:] if value.get("access_date") else "01" - return { - day: { - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } - } - - -def _build_document( - value, - document_id, - document_type, - parent_id=None, - source_identifiers=None, - metric_scope="item", -): - document = value.get("document") or {} - title = document.get("title") - if metric_scope == "title": - title = (value.get("source") or {}).get("main_title") or title - - identifiers = _document_identifiers( - value=value, - document_id=document_id, - source_identifiers=source_identifiers, - metric_scope=metric_scope, - ) - - return _compact_dict( - { - "id": document_id, - "type": document_type, - "title": title, - "parent_id": parent_id, - "publication_year": value.get("publication_year"), - 
"identifiers": identifiers, - } - ) - - -def _document_identifiers( - value, document_id, source_identifiers=None, metric_scope="item" -): - if value.get("collection") == "books" and metric_scope == "title": - identifiers = _book_identifiers_from_pid(document_id) - identifiers.update(source_identifiers or {}) - return _compact_identifiers(identifiers, canonical_id=document_id) - - document_identifiers = (value.get("document") or {}).get("identifiers") or {} - identifiers = { - "pid_v2": value.get("pid_v2"), - "pid_v3": value.get("pid_v3"), - "pid_generic": value.get("pid_generic"), - } - identifiers.update(document_identifiers) - - if value.get("collection") == "books": - identifiers.update(_book_identifiers_from_pid(value.get("pid_generic"))) - identifiers.update(source_identifiers or {}) - - return _compact_identifiers(identifiers, canonical_id=document_id) - - -def _book_identifiers_from_pid(pid_generic): - value = str(pid_generic or "") - if not value.upper().startswith("BOOK:"): - return {} - - identifiers = {} - parts = value.split("/", 1) - book_id = parts[0].split(":", 1)[1] if ":" in parts[0] else "" - if book_id: - identifiers["book_id"] = book_id - - if len(parts) > 1 and parts[1].upper().startswith("CHAPTER:"): - chapter_id = parts[1].split(":", 1)[1] if ":" in parts[1] else "" - if chapter_id: - identifiers["chapter_id"] = chapter_id - - return identifiers - - -def _build_source(source): - source = source or {} - source_id = source.get("source_id") - source_type = source.get("source_type") - identifiers = _compact_identifiers( - source.get("identifiers") or {}, canonical_id=source_id - ) - - return _compact_dict( - { - "id": source_id, - "type": source_type, - "title": source.get("main_title"), - "scielo_issn": None if source_type == "book" else source.get("scielo_issn"), - "acronym": source.get("acronym"), - "publisher_name": source.get("publisher_name"), - "subject_area_capes": source.get("subject_area_capes"), - "subject_area_wos": 
source.get("subject_area_wos"), - "access_type": source.get("access_type"), - "city": source.get("city"), - "country": source.get("country"), - "identifiers": identifiers, - } - ) - - -def _compact_identifiers(identifiers, canonical_id=None): - compact = {} - canonical_value = str(canonical_id or "").strip().upper() - for key, value in (identifiers or {}).items(): - if value in (None, "", [], {}, ()): - continue - if canonical_value and str(value).strip().upper() == canonical_value: - continue - compact[key] = value - return compact - - -def _compact_dict(data): - return { - key: value for key, value in data.items() if value not in (None, "", [], {}, ()) - } diff --git a/metrics/counter/identifiers.py b/metrics/counter/identifiers.py deleted file mode 100644 index bef7b8d..0000000 --- a/metrics/counter/identifiers.py +++ /dev/null @@ -1,110 +0,0 @@ -def generate_user_session_id(client_name, client_version, ip_address, datetime, sep="|"): - dt_year_month_day = datetime.strftime("%Y-%m-%d") - dt_hour = datetime.strftime("%H") - - return sep.join( - [ - str(client_name), - str(client_version), - str(ip_address), - str(dt_year_month_day), - str(dt_hour), - ] - ) - - -def generate_item_access_id( - col_acron3, - source_key, - pid_v2, - pid_v3, - pid_generic, - user_session_id, - access_country_code, - content_language, - media_format, - content_type, - sep="|", -): - return sep.join( - [ - col_acron3, - str(source_key or ""), - pid_v2 or "", - pid_v3 or "", - pid_generic or "", - str(user_session_id or ""), - str(access_country_code or ""), - str(content_language or ""), - str(media_format or ""), - str(content_type or ""), - ] - ) - - -def generate_month_document_id( - collection: str, - source_key: str, - pid_v2: str, - pid_v3: str, - pid_generic: str, - access_month: str, - counter_access_type: str, - access_method: str, - publication_year: str, - metric_scope: str = None, -) -> str: - parts = [] - if metric_scope: - parts.append(metric_scope) - - parts.extend( - [ - 
str(collection or ""), - str(source_key or ""), - pid_v2 or "", - pid_v3 or "", - pid_generic or "", - str(access_month or ""), - str(counter_access_type or ""), - str(access_method or ""), - str(publication_year or ""), - ] - ) - return "|".join(parts) - - -def generate_year_document_id( - collection: str, - source_key: str, - pid_v2: str, - pid_v3: str, - pid_generic: str, - content_language: str, - access_country_code: str, - access_year: str, - counter_access_type: str, - access_method: str, - publication_year: str, - metric_scope: str = None, -) -> str: - parts = [] - if metric_scope: - parts.append(metric_scope) - - parts.extend( - [ - str(collection or ""), - str(source_key or ""), - pid_v2 or "", - pid_v3 or "", - pid_generic or "", - content_language or "", - access_country_code or "", - str(access_year or ""), - str(counter_access_type or ""), - str(access_method or ""), - str(publication_year or ""), - ] - ) - return "|".join(parts) diff --git a/metrics/counter/indexing/__init__.py b/metrics/counter/indexing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/counter/indexing/converter.py b/metrics/counter/indexing/converter.py new file mode 100644 index 0000000..4b4ab1f --- /dev/null +++ b/metrics/counter/indexing/converter.py @@ -0,0 +1,58 @@ +from metrics.counter.indexing.engines.article import ArticlePipeline +from metrics.counter.indexing.engines.base import DocumentPipeline +from metrics.counter.indexing.engines.book import BookPipeline +from metrics.counter.indexing.engines.dataset import DatasetPipeline +from metrics.counter.indexing.engines.preprint import PreprintPipeline + +_PIPELINES = { + "article": ArticlePipeline(), + "preprint": PreprintPipeline(), + "dataset": DatasetPipeline(), + "book": BookPipeline(), + "chapter": BookPipeline(), +} +_DEFAULT = DocumentPipeline() + + +def convert(data): + if not isinstance(data, dict): + return {"month": {}, "year": {}} + + month_data = {} + month_unique_state = 
_initialize_unique_state() + year_data = {} + year_unique_state = _initialize_unique_state() + + for value in data.values(): + pipeline = _get_pipeline(value) + pipeline.accumulate( + data=month_data, + unique_state=month_unique_state, + value=value, + granularity="month", + ) + pipeline.accumulate( + data=year_data, + unique_state=year_unique_state, + value=value, + granularity="year", + ) + + return {"month": month_data, "year": year_data} + + +def _get_pipeline(value): + collection = value.get("collection") + if collection == "books": + return _PIPELINES["book"] + + return _PIPELINES.get(value.get("document_type"), _DEFAULT) + + +def _initialize_unique_state(): + return { + "item_investigations": set(), + "item_requests": set(), + "title_investigations": set(), + "title_requests": set(), + } diff --git a/metrics/counter/indexing/engines/__init__.py b/metrics/counter/indexing/engines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/counter/indexing/engines/article.py b/metrics/counter/indexing/engines/article.py new file mode 100644 index 0000000..14114dc --- /dev/null +++ b/metrics/counter/indexing/engines/article.py @@ -0,0 +1,11 @@ +from metrics.counter.indexing.engines.base import DocumentPipeline + + +class ArticlePipeline(DocumentPipeline): + data_type = "Article" + + def _resolve_parent_data_type(self, value): + source_type = (value.get("source") or {}).get("source_type") + if source_type == "journal": + return "Journal" + return None diff --git a/metrics/counter/indexing/engines/base.py b/metrics/counter/indexing/engines/base.py new file mode 100644 index 0000000..902cf83 --- /dev/null +++ b/metrics/counter/indexing/engines/base.py @@ -0,0 +1,371 @@ +from scielo_usage_counter.counter import get_valid_clicks, is_request + + +class DocumentPipeline: + data_type = "Other" + + def accumulate(self, data, unique_state, value, granularity): + if not isinstance(value, dict): + return + + document_id = self._generate_document_id(value, 
granularity) + document = data.setdefault( + document_id, + self._build_document(value=value, granularity=granularity), + ) + + self._apply_totals( + document=document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + self._apply_uniques( + document=document, + unique_state=unique_state, + scope="item", + document_id=document_id, + user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + def _generate_document_id( + self, value, granularity, metric_scope=None, pid_generic=None + ): + pid_generic = pid_generic or value.get("pid_generic") + publication_year = str(value.get("publication_year") or "0001") + if granularity == "month": + access_month = ( + value.get("access_date", "")[:7] if value.get("access_date") else "" + ) + return _generate_month_document_id( + collection=value.get("collection"), + source_key=value.get("source_key"), + pid_v2=value.get("pid_v2"), + pid_v3=value.get("pid_v3"), + pid_generic=pid_generic, + access_month=access_month, + counter_access_type=value.get("counter_access_type") or "Open", + access_method=value.get("access_method") or "Regular", + publication_year=publication_year, + metric_scope="title" if metric_scope == "title" else None, + ) + + return _generate_year_document_id( + collection=value.get("collection"), + source_key=value.get("source_key"), + pid_v2=value.get("pid_v2"), + pid_v3=value.get("pid_v3"), + pid_generic=pid_generic, + content_language=value.get("content_language"), + access_country_code=value.get("access_country_code"), + access_year=value.get("access_year"), + counter_access_type=value.get("counter_access_type") or "Open", + access_method=value.get("access_method") or "Regular", + publication_year=publication_year, + metric_scope="title" if metric_scope == "title" else None, + ) + + def _build_document(self, value, granularity, **kwargs): + 
document_type = value.get("document_type") + document_id = self._resolve_document_id(value) + + base_document = { + "collection": value.get("collection"), + "source": self._build_source(value.get("source")), + "document": self._build_document_section( + value=value, + document_id=document_id, + document_type=document_type, + ), + "counter": self._build_counter_section(value), + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + + base_document["access"] = self._build_access(value, granularity) + if granularity == "month": + base_document["daily_metrics"] = self._build_daily_metrics(value) + return base_document + + def _resolve_document_id(self, value): + return value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic") + + def _resolve_parent_data_type(self, value): + return None + + def _build_counter_section(self, value): + return _strip_empty_values( + { + "metric_scope": "item", + "data_type": self.data_type, + "parent_data_type": self._resolve_parent_data_type(value), + "access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + } + ) + + def _build_document_section( + self, + value, + document_id, + document_type, + parent_id=None, + source_identifiers=None, + metric_scope="item", + ): + document = value.get("document") or {} + title = document.get("title") + if metric_scope == "title": + title = (value.get("source") or {}).get("main_title") or title + + identifiers = self._document_identifiers( + value=value, + document_id=document_id, + source_identifiers=source_identifiers, + metric_scope=metric_scope, + ) + + return _strip_empty_values( + { + "id": document_id, + "type": document_type, + "title": title, + "parent_id": parent_id, + "publication_year": value.get("publication_year"), + "identifiers": identifiers, + } + ) + + def _document_identifiers( + self, value, document_id, source_identifiers=None, metric_scope="item" + ): 
+ document_identifiers = (value.get("document") or {}).get("identifiers") or {} + identifiers = { + "pid_v2": value.get("pid_v2"), + "pid_v3": value.get("pid_v3"), + "pid_generic": value.get("pid_generic"), + } + identifiers.update(document_identifiers) + return _strip_empty_identifiers(identifiers, canonical_id=document_id) + + @staticmethod + def _build_source(source): + source = source or {} + source_id = source.get("source_id") + source_type = source.get("source_type") + identifiers = _strip_empty_identifiers( + source.get("identifiers") or {}, canonical_id=source_id + ) + + return _strip_empty_values( + { + "id": source_id, + "type": source_type, + "title": source.get("main_title"), + "scielo_issn": None + if source_type == "book" + else source.get("scielo_issn"), + "acronym": source.get("acronym"), + "publisher_name": source.get("publisher_name"), + "subject_area_capes": source.get("subject_area_capes"), + "subject_area_wos": source.get("subject_area_wos"), + "access_type": source.get("access_type"), + "city": source.get("city"), + "country": source.get("country"), + "identifiers": identifiers, + } + ) + + @staticmethod + def _build_access(value, granularity): + if granularity == "month": + return { + "month": value.get("access_date", "")[:7] + if value.get("access_date") + else "" + } + + return _strip_empty_values( + { + "year": value.get("access_year"), + "country_code": value.get("access_country_code"), + "content_language": value.get("content_language"), + } + ) + + @staticmethod + def _build_daily_metrics(value): + day = value.get("access_date", "")[-2:] if value.get("access_date") else "01" + return { + day: { + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + } + + @staticmethod + def _apply_totals( + document, click_timestamps, content_type, click_timestamps_by_url=None + ): + number_of_clicks = _count_valid_clicks( + click_timestamps=click_timestamps, + 
click_timestamps_by_url=click_timestamps_by_url, + ) + + document["total_investigations"] += number_of_clicks + if is_request(content_type): + document["total_requests"] += number_of_clicks + + if "daily_metrics" in document: + day_key = list(document["daily_metrics"].keys())[0] + document["daily_metrics"][day_key][ + "total_investigations" + ] += number_of_clicks + if is_request(content_type): + document["daily_metrics"][day_key]["total_requests"] += number_of_clicks + + @staticmethod + def _apply_uniques( + document, + unique_state, + scope, + document_id, + user_session_id, + is_request_event, + ): + if not user_session_id: + return + + inv_bucket = unique_state[f"{scope}_investigations"] + inv_key = (document_id, user_session_id) + add_investigation = inv_key not in inv_bucket + if add_investigation: + inv_bucket.add(inv_key) + + add_request = False + if is_request_event: + req_bucket = unique_state[f"{scope}_requests"] + req_key = (document_id, user_session_id) + add_request = req_key not in req_bucket + if add_request: + req_bucket.add(req_key) + + _increment_document_uniques( + document=document, + add_investigation=add_investigation, + add_request=add_request, + ) + + +def _increment_document_uniques(document, add_investigation=False, add_request=False): + if add_investigation: + document["unique_investigations"] += 1 + if add_request: + document["unique_requests"] += 1 + + if "daily_metrics" in document: + day_key = list(document["daily_metrics"].keys())[0] + if add_investigation: + document["daily_metrics"][day_key]["unique_investigations"] += 1 + if add_request: + document["daily_metrics"][day_key]["unique_requests"] += 1 + + +def _count_valid_clicks(click_timestamps, click_timestamps_by_url=None): + if isinstance(click_timestamps_by_url, dict) and click_timestamps_by_url: + return sum( + get_valid_clicks(timestamps or {}) + for timestamps in click_timestamps_by_url.values() + ) + return get_valid_clicks(click_timestamps or {}) + + +def 
_strip_empty_identifiers(identifiers, canonical_id=None): + compact = {} + canonical_value = str(canonical_id or "").strip().upper() + for key, value in (identifiers or {}).items(): + if value in (None, "", [], {}, ()): + continue + if canonical_value and str(value).strip().upper() == canonical_value: + continue + compact[key] = value + return compact + + +def _strip_empty_values(data): + return { + key: value for key, value in data.items() if value not in (None, "", [], {}, ()) + } + + +def _generate_month_document_id( + collection, + source_key, + pid_v2, + pid_v3, + pid_generic, + access_month, + counter_access_type, + access_method, + publication_year, + metric_scope=None, +): + parts = [] + if metric_scope: + parts.append(metric_scope) + + parts.extend( + [ + str(collection or ""), + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + str(access_month or ""), + str(counter_access_type or ""), + str(access_method or ""), + str(publication_year or ""), + ] + ) + return "|".join(parts) + + +def _generate_year_document_id( + collection, + source_key, + pid_v2, + pid_v3, + pid_generic, + content_language, + access_country_code, + access_year, + counter_access_type, + access_method, + publication_year, + metric_scope=None, +): + parts = [] + if metric_scope: + parts.append(metric_scope) + + parts.extend( + [ + str(collection or ""), + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + content_language or "", + access_country_code or "", + str(access_year or ""), + str(counter_access_type or ""), + str(access_method or ""), + str(publication_year or ""), + ] + ) + return "|".join(parts) diff --git a/metrics/counter/indexing/engines/book.py b/metrics/counter/indexing/engines/book.py new file mode 100644 index 0000000..0ec3bd2 --- /dev/null +++ b/metrics/counter/indexing/engines/book.py @@ -0,0 +1,195 @@ +from scielo_usage_counter.counter import is_request + +from metrics.counter.indexing.engines.base import ( + 
DocumentPipeline, + _strip_empty_identifiers, + _strip_empty_values, +) + + +class BookPipeline(DocumentPipeline): + def accumulate(self, data, unique_state, value, granularity): + if not isinstance(value, dict): + return + + if _should_create_item_document(value): + self._accumulate_item(data, unique_state, value, granularity) + + title_pid_generic = _extract_title_pid_generic(value) + if not title_pid_generic: + return + + self._accumulate_title( + data, unique_state, value, granularity, title_pid_generic + ) + + def _accumulate_item(self, data, unique_state, value, granularity): + item_document_id = self._generate_document_id( + value, + granularity, + metric_scope="item", + ) + item_document = data.setdefault( + item_document_id, + self._build_document( + value=value, + granularity=granularity, + metric_scope="item", + ), + ) + self._apply_totals( + document=item_document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + self._apply_uniques( + document=item_document, + unique_state=unique_state, + scope="item", + document_id=item_document_id, + user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + def _accumulate_title( + self, data, unique_state, value, granularity, title_pid_generic + ): + title_document_id = self._generate_document_id( + value, + granularity, + metric_scope="title", + pid_generic=title_pid_generic, + ) + title_document = data.setdefault( + title_document_id, + self._build_document( + value=value, + granularity=granularity, + metric_scope="title", + pid_generic=title_pid_generic, + document_type="book", + ), + ) + self._apply_totals( + document=title_document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + self._apply_uniques( + document=title_document, + 
unique_state=unique_state, + scope="title", + document_id=title_document_id, + user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + def _build_document(self, value, granularity, **kwargs): + metric_scope = kwargs.get("metric_scope") or "item" + pid_generic = kwargs.get("pid_generic") + document_type = kwargs.get("document_type") + + document_id = pid_generic or value.get("pid_generic") + parent_id = _extract_title_pid_generic(value, fallback=document_id) + if parent_id == document_id or metric_scope == "title": + parent_id = None + raw_source = value.get("source") or {} + source = self._build_source(raw_source) + + base_document = { + "collection": value.get("collection"), + "source": source, + "document": self._build_document_section( + value=value, + document_id=document_id, + document_type=document_type or value.get("document_type"), + parent_id=parent_id, + source_identifiers=raw_source.get("identifiers"), + metric_scope=metric_scope, + ), + "counter": _strip_empty_values( + { + "metric_scope": metric_scope, + "data_type": "Book" if metric_scope == "title" else "Book_Segment", + "parent_data_type": "Book" if metric_scope != "title" else None, + "access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + } + ), + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + + base_document["access"] = self._build_access(value, granularity) + if granularity == "month": + base_document["daily_metrics"] = self._build_daily_metrics(value) + return base_document + + def _document_identifiers( + self, value, document_id, source_identifiers=None, metric_scope="item" + ): + if metric_scope == "title": + identifiers = _book_identifiers_from_pid(document_id) + identifiers.update(source_identifiers or {}) + return _strip_empty_identifiers(identifiers, canonical_id=document_id) + + document_identifiers = 
(value.get("document") or {}).get("identifiers") or {} + identifiers = { + "pid_v2": value.get("pid_v2"), + "pid_v3": value.get("pid_v3"), + "pid_generic": value.get("pid_generic"), + } + identifiers.update(document_identifiers) + identifiers.update(_book_identifiers_from_pid(value.get("pid_generic"))) + identifiers.update(source_identifiers or {}) + return _strip_empty_identifiers(identifiers, canonical_id=document_id) + + +def _should_create_item_document(value): + if not value.get("pid_generic"): + return False + if value.get("document_type") == "book" and not is_request( + value.get("content_type") + ): + return False + return True + + +def _extract_title_pid_generic(value, fallback=None): + title_pid_generic = value.get("title_pid_generic") + if title_pid_generic: + return title_pid_generic + + pid_generic = value.get("pid_generic") + if "/CHAPTER:" in (pid_generic or "").upper(): + return pid_generic.upper().split("/CHAPTER:")[0] + + source = value.get("source") or {} + source_id = source.get("source_id") + if source_id: + return f"BOOK:{str(source_id).upper()}" + + return fallback + + +def _book_identifiers_from_pid(pid_generic): + value = str(pid_generic or "") + if not value.upper().startswith("BOOK:"): + return {} + + identifiers = {} + parts = value.split("/", 1) + book_id = parts[0].split(":", 1)[1] if ":" in parts[0] else "" + if book_id: + identifiers["book_id"] = book_id + + if len(parts) > 1 and parts[1].upper().startswith("CHAPTER:"): + chapter_id = parts[1].split(":", 1)[1] if ":" in parts[1] else "" + if chapter_id: + identifiers["chapter_id"] = chapter_id + + return identifiers diff --git a/metrics/counter/indexing/engines/dataset.py b/metrics/counter/indexing/engines/dataset.py new file mode 100644 index 0000000..2ea60d8 --- /dev/null +++ b/metrics/counter/indexing/engines/dataset.py @@ -0,0 +1,5 @@ +from metrics.counter.indexing.engines.base import DocumentPipeline + + +class DatasetPipeline(DocumentPipeline): + data_type = "Dataset" diff 
--git a/metrics/counter/indexing/engines/preprint.py b/metrics/counter/indexing/engines/preprint.py new file mode 100644 index 0000000..5698be2 --- /dev/null +++ b/metrics/counter/indexing/engines/preprint.py @@ -0,0 +1,17 @@ +from metrics.counter.indexing.engines.base import DocumentPipeline, _strip_empty_values + + +class PreprintPipeline(DocumentPipeline): + data_type = "Article" + + def _build_counter_section(self, value): + return _strip_empty_values( + { + "metric_scope": "item", + "data_type": self.data_type, + "parent_data_type": self._resolve_parent_data_type(value), + "article_version": "Preprint", + "access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + } + ) diff --git a/metrics/counter/parser.py b/metrics/counter/parser.py deleted file mode 100644 index 2081e5d..0000000 --- a/metrics/counter/parser.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging - -from scielo_usage_counter.translator.classic import URLTranslatorClassicSite -from scielo_usage_counter.translator.books import URLTranslatorBooksSite -from scielo_usage_counter.translator.dataverse import URLTranslatorDataverseSite -from scielo_usage_counter.translator.opac import URLTranslatorOPACSite -from scielo_usage_counter.translator.opac_alpha import URLTranslatorOPACAlphaSite -from scielo_usage_counter.translator.preprints import URLTranslatorPreprintsSite - -from core.utils.date_utils import get_date_obj - - -def extract_date_from_validation_dict(validation): - """ - Extracts the date from the validation dict of a log file. - - Args: - validation (dict): The validation dict of the log file. - - Returns: - datetime.date: The extracted date. 
- """ - try: - date_str = validation.get('probably_date') - return get_date_obj(date_str, '%Y-%m-%d') - except Exception as e: - logging.error(f"Failed to extract date from validation: {e}") - return None - - -def translator_class_name_to_obj(name: str): - """ - Translates a class name to a class object." - - Parameters: - name (str): The name of the URL translator site. - """ - if not name or not isinstance(name, str): - return None - - translator_classes = { - 'books': URLTranslatorBooksSite, - 'classic': URLTranslatorClassicSite, - 'dataverse': URLTranslatorDataverseSite, - 'opac': URLTranslatorOPACSite, - 'opac_alpha': URLTranslatorOPACAlphaSite, - 'preprints': URLTranslatorPreprintsSite - } - return translator_classes.get(name.lower()) diff --git a/metrics/management/__init__.py b/metrics/management/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/metrics/management/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/metrics/management/commands/__init__.py b/metrics/management/commands/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/metrics/management/commands/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/metrics/management/commands/export_book_r51_monthly_metrics.py b/metrics/management/commands/export_book_r51_monthly_metrics.py deleted file mode 100644 index 1d78df0..0000000 --- a/metrics/management/commands/export_book_r51_monthly_metrics.py +++ /dev/null @@ -1,445 +0,0 @@ -import csv -import json -from collections import defaultdict -from pathlib import Path - -from device_detector import DeviceDetector -from django.core.management.base import BaseCommand, CommandError -from scielo_usage_counter.translator.books import URLTranslatorBooksSite - -from collection.models import Collection -from document.models import Document -from metrics.counter import access -from metrics.counter import documents as index_docs -from resources.models import MMDB, RobotUserAgent -from scielo_usage_counter import log_handler, 
url_translator -from source.models import Source - - -class Command(BaseCommand): - help = ( - "Generate COUNTER R5.1 monthly book metrics from one or more log files, " - "writing item and title CSV outputs." - ) - - def add_arguments(self, parser): - parser.add_argument( - "--input", - dest="inputs", - action="append", - required=True, - help="Input log file path. Repeat --input for multiple files.", - ) - parser.add_argument( - "--item-output", - required=True, - help="Output CSV path for item-level monthly metrics.", - ) - parser.add_argument( - "--title-output", - required=True, - help="Output CSV path for title-level monthly metrics.", - ) - parser.add_argument( - "--summary-output", - help="Optional JSON path with parse and totals summary.", - ) - parser.add_argument( - "--collection", - default="books", - help="Collection acronym (default: books).", - ) - parser.add_argument( - "--robots-source", - choices=sorted(RobotUserAgent.SOURCE_CHOICES), - default=RobotUserAgent.SOURCE_ALL, - help="Which active robot list to use: all, counter, or scielo.", - ) - - def handle(self, *args, **options): - input_paths = [Path(value).expanduser() for value in options["inputs"]] - item_output = Path(options["item_output"]).expanduser() - title_output = Path(options["title_output"]).expanduser() - summary_output = ( - Path(options["summary_output"]).expanduser() - if options.get("summary_output") - else None - ) - - for path in input_paths: - if not path.exists(): - raise CommandError(f"Input file not found: {path}") - - collection = Collection.objects.filter(acron3=options["collection"]).first() - if not collection: - raise CommandError(f"Collection not found: {options['collection']}") - - robots_source = options["robots_source"] - robots_list = RobotUserAgent.get_patterns(source=robots_source) - if not robots_list: - raise CommandError( - f"No robot user agents found in database for source {robots_source}." 
- ) - - mmdb = MMDB.objects.order_by("-created").first() - if not mmdb: - raise CommandError("No MMDB found in database.") - - parser = log_handler.LogParser( - mmdb_data=mmdb.data, - robots_list=robots_list, - output_mode="dict", - ) - utm = url_translator.URLTranslationManager( - documents_metadata=Document.metadata(collection=collection), - sources_metadata=Source.metadata(collection=collection), - translator=URLTranslatorBooksSite, - ) - - results = {} - parse_summaries = [] - ua_cache = {} - - for path in input_paths: - self.stdout.write(f"Processing {path}...") - parse_summaries.append( - self._parse_file( - path=path, - parser=parser, - utm=utm, - collection=collection, - ua_cache=ua_cache, - results=results, - ) - ) - - monthly_documents = self._build_monthly_documents(results) - - self._write_item_csv(item_output, monthly_documents["item"]) - self._write_title_csv(title_output, monthly_documents["title"]) - - summary = { - "robots_source": robots_source, - "raw_result_count": len(results), - "parse_summaries": parse_summaries, - "totals": { - "total_item_requests": sum( - doc.get("total_requests", 0) for doc in monthly_documents["item"] - ), - "total_item_investigations": sum( - doc.get("total_investigations", 0) - for doc in monthly_documents["item"] - ), - "unique_item_requests": sum( - doc.get("unique_requests", 0) for doc in monthly_documents["item"] - ), - "unique_item_investigations": sum( - doc.get("unique_investigations", 0) - for doc in monthly_documents["item"] - ), - "title_total_item_requests": sum( - doc.get("total_requests", 0) for doc in monthly_documents["title"] - ), - "title_total_item_investigations": sum( - doc.get("total_investigations", 0) - for doc in monthly_documents["title"] - ), - "unique_title_requests": sum( - doc.get("unique_requests", 0) for doc in monthly_documents["title"] - ), - "unique_title_investigations": sum( - doc.get("unique_investigations", 0) - for doc in monthly_documents["title"] - ), - }, - } - - if 
summary_output: - summary_output.parent.mkdir(parents=True, exist_ok=True) - summary_output.write_text(json.dumps(summary, indent=2, sort_keys=True)) - - self.stdout.write(self.style.SUCCESS(f"Item CSV written to {item_output}")) - self.stdout.write(self.style.SUCCESS(f"Title CSV written to {title_output}")) - if summary_output: - self.stdout.write( - self.style.SUCCESS(f"Summary JSON written to {summary_output}") - ) - - def _parse_file(self, path, parser, utm, collection, ua_cache, results): - stats = defaultdict(int) - imported = 0 - - with path.open("rb") as fh: - for raw_line in fh: - stats["lines_parsed"] += 1 - - try: - line = raw_line.decode().strip() - except UnicodeDecodeError: - line = raw_line.decode("utf-8", errors="ignore").strip() - - match, ip_value = parser.match_with_best_pattern(line) - if not match: - stats["total_ignored_lines"] += 1 - continue - - data = match.groupdict() - is_bunny = "unix_ts" in data - method = "GET" if is_bunny else data.get("method") - status = data.get("status") - user_agent = parser.format_user_agent(data.get("user_agent")) - url = data.get("path") - ip_address = ip_value - - if not parser.has_valid_method(method): - stats["ignored_lines_invalid_method"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if not parser.has_valid_status(status): - if parser.status_is_redirect(status): - stats["ignored_lines_http_redirects"] += 1 - elif parser.status_is_error(status): - stats["ignored_lines_http_errors"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if parser.user_agent_is_bot(user_agent): - stats["ignored_lines_bot"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if not parser.has_supported_url(url): - stats["ignored_lines_static_resources"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if is_bunny: - local_datetime = parser.format_date(data.get("unix_ts"), None) - country_code = data.get( - "country" - ) or parser.geoip.ip_to_country_code(ip_address) - else: - local_datetime = 
parser.format_date( - data.get("date"), data.get("timezone") - ) - country_code = parser.geoip.ip_to_country_code(ip_address) - - if not local_datetime: - stats["ignored_lines_invalid_local_datetime"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if not country_code: - stats["ignored_lines_invalid_country_code"] += 1 - stats["total_ignored_lines"] += 1 - continue - - device = ua_cache.get(user_agent) - if device is None: - try: - device = DeviceDetector(user_agent).parse() - except ZeroDivisionError: - stats["ignored_lines_invalid_user_agent"] += 1 - stats["total_ignored_lines"] += 1 - ua_cache[user_agent] = False - continue - ua_cache[user_agent] = device - elif device is False: - stats["ignored_lines_invalid_user_agent"] += 1 - stats["total_ignored_lines"] += 1 - continue - - client_name = parser.format_client_name(device) - client_version = parser.format_client_version(device) - - if not client_name: - stats["ignored_lines_invalid_client_name"] += 1 - stats["total_ignored_lines"] += 1 - continue - - if not client_version: - stats["ignored_lines_invalid_client_version"] += 1 - stats["total_ignored_lines"] += 1 - continue - - translated = utm.translate(url) - item_access_data = access.extract_item_access_data( - collection.acron3, - translated, - ) - is_valid, _ = access.is_valid_item_access_data( - item_access_data, - utm, - ignore_utm_validation=True, - ) - if not is_valid: - stats["total_ignored_lines"] += 1 - continue - - access.update_results_with_item_access_data( - results, - item_access_data, - { - "client_name": client_name, - "client_version": client_version, - "ip_address": ip_address, - "country_code": country_code, - "local_datetime": local_datetime, - "url": url, - }, - ) - imported += 1 - stats["total_imported_lines"] += 1 - - return {"path": str(path), "valid_lines_used": imported, **stats} - - def _build_monthly_documents(self, results): - documents = index_docs.convert_raw_results_to_index_documents(results) - item_documents = {} - 
title_documents = {} - - for doc in documents["month"].values(): - access = doc.get("access") or {} - counter = doc.get("counter") or {} - document = doc.get("document") or {} - year_month = access.get("month", "") - scope = counter.get("metric_scope", "item") - if scope == "title": - title_id = document.get("id") - key = ( - year_month, - title_id, - document.get("type"), - ) - if key not in title_documents: - title_documents[key] = { - "year_month": year_month, - "title_pid_generic": title_id, - "document_type": document.get("type"), - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } - title_documents[key]["total_requests"] += doc.get("total_requests", 0) - title_documents[key]["total_investigations"] += doc.get( - "total_investigations", 0 - ) - title_documents[key]["unique_requests"] += doc.get("unique_requests", 0) - title_documents[key]["unique_investigations"] += doc.get( - "unique_investigations", 0 - ) - continue - - item_id = document.get("id") - title_id = document.get("parent_id") or item_id - key = ( - year_month, - title_id, - item_id, - document.get("type"), - ) - if key not in item_documents: - item_documents[key] = { - "year_month": year_month, - "title_pid_generic": title_id, - "segment_pid_generic": item_id, - "document_type": document.get("type"), - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } - item_documents[key]["total_requests"] += doc.get("total_requests", 0) - item_documents[key]["total_investigations"] += doc.get( - "total_investigations", 0 - ) - item_documents[key]["unique_requests"] += doc.get("unique_requests", 0) - item_documents[key]["unique_investigations"] += doc.get( - "unique_investigations", 0 - ) - - return { - "item": list(item_documents.values()), - "title": list(title_documents.values()), - } - - @staticmethod - def _write_item_csv(path, item_documents): - path.parent.mkdir(parents=True, exist_ok=True) - 
with path.open("w", newline="") as fh: - writer = csv.DictWriter( - fh, - fieldnames=[ - "year_month", - "title_pid_generic", - "segment_pid_generic", - "document_type", - "total_item_requests", - "total_item_investigations", - "unique_item_requests", - "unique_item_investigations", - ], - ) - writer.writeheader() - for doc in sorted( - item_documents, - key=lambda item: ( - item.get("year_month", ""), - item.get("title_pid_generic") or "", - item.get("segment_pid_generic") or "", - ), - ): - writer.writerow( - { - "year_month": doc.get("year_month", ""), - "title_pid_generic": doc.get("title_pid_generic"), - "segment_pid_generic": doc.get("segment_pid_generic"), - "document_type": doc.get("document_type"), - "total_item_requests": doc.get("total_requests", 0), - "total_item_investigations": doc.get("total_investigations", 0), - "unique_item_requests": doc.get("unique_requests", 0), - "unique_item_investigations": doc.get( - "unique_investigations", 0 - ), - } - ) - - @staticmethod - def _write_title_csv(path, title_documents): - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", newline="") as fh: - writer = csv.DictWriter( - fh, - fieldnames=[ - "year_month", - "title_pid_generic", - "document_type", - "total_item_requests", - "total_item_investigations", - "unique_title_requests", - "unique_title_investigations", - ], - ) - writer.writeheader() - for doc in sorted( - title_documents, - key=lambda item: ( - item.get("year_month", ""), - item.get("title_pid_generic") or "", - ), - ): - writer.writerow( - { - "year_month": doc.get("year_month", ""), - "title_pid_generic": doc.get("title_pid_generic"), - "document_type": doc.get("document_type"), - "total_item_requests": doc.get("total_requests", 0), - "total_item_investigations": doc.get("total_investigations", 0), - "unique_title_requests": doc.get("unique_requests", 0), - "unique_title_investigations": doc.get( - "unique_investigations", 0 - ), - } - ) diff --git 
a/metrics/management/commands/schedule_cleanup_daily_payloads.py b/metrics/management/commands/schedule_cleanup_daily_payloads.py deleted file mode 100644 index 285a23f..0000000 --- a/metrics/management/commands/schedule_cleanup_daily_payloads.py +++ /dev/null @@ -1,68 +0,0 @@ -from django.core.management.base import BaseCommand - -from core.utils.scheduler import schedule_task -from metrics.tasks import task_cleanup_daily_payloads - - -class Command(BaseCommand): - help = ( - "Schedule the periodic cleanup of exported daily metric payload files. " - "Runs weekly on Sunday at 03:00 UTC by default, deleting payload files " - "for jobs that were exported more than 7 days ago." - ) - - def add_arguments(self, parser): - parser.add_argument( - "--day-of-week", - default="0", - help="Crontab day of week (0=Sunday, 6=Saturday). Default: 0", - ) - parser.add_argument( - "--hour", - default="3", - help="Crontab hour (0-23). Default: 3", - ) - parser.add_argument( - "--minute", - default="0", - help="Crontab minute (0-59). Default: 0", - ) - parser.add_argument( - "--older-than-days", - type=int, - default=7, - help="Only delete payloads exported more than N days ago. Default: 7", - ) - parser.add_argument( - "--collection", - action="append", - dest="collections", - help="Limit cleanup to a specific collection acronym. 
Repeat for multiple.", - ) - - def handle(self, *args, **options): - celery_task_name = task_cleanup_daily_payloads.name - - kwargs = { - "older_than_days": options["older_than_days"], - "collections": options.get("collections") or [], - } - - schedule_task( - task=celery_task_name, - name=celery_task_name, - kwargs=kwargs, - description="Weekly cleanup of exported daily payload files from disk.", - day_of_week=options["day_of_week"], - hour=options["hour"], - minute=options["minute"], - ) - - self.stdout.write( - self.style.SUCCESS( - f"Scheduled periodic task '{celery_task_name}' " - f"(day_of_week={options['day_of_week']}, hour={options['hour']}, " - f"minute={options['minute']}, older_than_days={kwargs['older_than_days']}, " - f"collections={kwargs['collections'] or 'all'})." - ) - ) diff --git a/metrics/migrations/0001_initial.py b/metrics/migrations/0001_initial.py index 9746d5f..bfae3b5 100644 --- a/metrics/migrations/0001_initial.py +++ b/metrics/migrations/0001_initial.py @@ -28,11 +28,15 @@ class Migration(migrations.Migration): ), ( "created", - models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), + models.DateTimeField( + auto_now_add=True, verbose_name="Creation date" + ), ), ( "updated", - models.DateTimeField(auto_now=True, verbose_name="Last update date"), + models.DateTimeField( + auto_now=True, verbose_name="Last update date" + ), ), ( "access_date", @@ -85,7 +89,9 @@ class Migration(migrations.Migration): ), ( "error_message", - models.TextField(blank=True, default="", verbose_name="Error Message"), + models.TextField( + blank=True, default="", verbose_name="Error Message" + ), ), ( "export_started_at", @@ -97,7 +103,9 @@ class Migration(migrations.Migration): ), ( "exported_at", - models.DateTimeField(blank=True, null=True, verbose_name="Exported At"), + models.DateTimeField( + blank=True, null=True, verbose_name="Exported At" + ), ), ( "collection", diff --git a/metrics/models.py b/metrics/models.py index aa789b5..2a7e8b2 
100644 --- a/metrics/models.py +++ b/metrics/models.py @@ -100,8 +100,13 @@ class Meta: verbose_name_plural = _("Daily Metric Jobs") unique_together = (("collection", "access_date"),) indexes = [ - models.Index(fields=["collection", "access_date"], name="metrics_daily_coll_date_idx"), - models.Index(fields=["status", "export_started_at"], name="metrics_daily_status_exp_idx"), + models.Index( + fields=["collection", "access_date"], name="metrics_daily_coll_date_idx" + ), + models.Index( + fields=["status", "export_started_at"], + name="metrics_daily_status_exp_idx", + ), ] def __str__(self): diff --git a/metrics/opensearch/__init__.py b/metrics/opensearch/__init__.py index fb9df20..e69de29 100644 --- a/metrics/opensearch/__init__.py +++ b/metrics/opensearch/__init__.py @@ -1,8 +0,0 @@ -from .client import OpenSearchUsageClient -from .mappings import ( - BOOKS_MONTH_INDEX_MAPPINGS, - BOOKS_YEAR_INDEX_MAPPINGS, - MONTH_INDEX_MAPPINGS, - YEAR_INDEX_MAPPINGS, - get_index_mappings, -) diff --git a/metrics/opensearch/client.py b/metrics/opensearch/client.py index ce0de5c..271acee 100644 --- a/metrics/opensearch/client.py +++ b/metrics/opensearch/client.py @@ -3,12 +3,9 @@ from django.conf import settings from opensearchpy import NotFoundError, OpenSearch, helpers +from metrics.opensearch.mappings import get_index_mappings from metrics.opensearch.names import generate_month_index_name, generate_year_index_name - -from .mappings import get_index_mappings -from .scripts import ( - IDEMPOTENT_JOB_INCREMENT_SCRIPT, - METRIC_FIELDS, +from metrics.opensearch.painless import ( build_idempotent_job_increment_action, merge_metric_document, ) @@ -18,7 +15,13 @@ class OpenSearchUsageClient: def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=None): self.client = self.get_opensearch_client(url, basic_auth, api_key, verify_certs) - def get_opensearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=None): + def get_opensearch_client( + self, + 
url=None, + basic_auth=None, + api_key=None, + verify_certs=None, + ): url = url or getattr(settings, "OPENSEARCH_URL", None) basic_auth = basic_auth or getattr(settings, "OPENSEARCH_BASIC_AUTH", None) api_key = api_key or getattr(settings, "OPENSEARCH_API_KEY", None) @@ -26,7 +29,11 @@ def get_opensearch_client(self, url=None, basic_auth=None, api_key=None, verify_ verify_certs = getattr(settings, "OPENSEARCH_VERIFY_CERTS", False) if basic_auth: - return OpenSearch(url, http_auth=tuple(basic_auth), verify_certs=verify_certs) + return OpenSearch( + url, + http_auth=tuple(basic_auth), + verify_certs=verify_certs, + ) if api_key: return OpenSearch(url, api_key=api_key, verify_certs=verify_certs) return OpenSearch(url, verify_certs=verify_certs) @@ -56,23 +63,32 @@ def create_index_if_not_exists(self, index_name, mappings, ping_client=False): return if not self.client.indices.exists(index=index_name): - self.create_index(index_name=index_name, mappings=mappings, ping_client=False) + self.create_index( + index_name=index_name, + mappings=mappings, + ping_client=False, + ) def ensure_usage_indexes(self, collection, access_date, index_prefix=None): - index_prefix = index_prefix or getattr(settings, "OPENSEARCH_INDEX_NAME", "usage") + index_prefix = index_prefix or getattr( + settings, + "OPENSEARCH_INDEX_NAME", + "usage", + ) year_index = generate_year_index_name(index_prefix, collection, access_date) month_index = generate_month_index_name(index_prefix, collection, access_date) - self.create_index_if_not_exists(year_index, get_index_mappings(collection, "year")) - self.create_index_if_not_exists(month_index, get_index_mappings(collection, "month")) + self.create_index_if_not_exists( + year_index, + get_index_mappings(collection, "year"), + ) + self.create_index_if_not_exists( + month_index, + get_index_mappings(collection, "month"), + ) return {"year": year_index, "month": month_index} - def delete_index(self, index_name, ping_client=False): - if ping_client and not 
self.ping(): - return - self.client.indices.delete(index=index_name) - def index_documents(self, index_name, documents, ping_client=False): if ping_client and not self.ping(): return @@ -207,12 +223,19 @@ def sync_documents(self, index_name, documents, operation="add", ping_client=Fal if not documents: return - existing_documents = self.fetch_documents_by_ids(index_name=index_name, doc_ids=list(documents.keys())) + existing_documents = self.fetch_documents_by_ids( + index_name=index_name, + doc_ids=list(documents.keys()), + ) upserts = {} deletes = [] for doc_id, document in documents.items(): - merged = merge_metric_document(existing_documents.get(doc_id), document, operation=operation) + merged = merge_metric_document( + existing_documents.get(doc_id), + document, + operation=operation, + ) if merged is None: if doc_id in existing_documents: deletes.append(doc_id) diff --git a/metrics/opensearch/mappings.py b/metrics/opensearch/mappings.py index def652f..de4dbae 100644 --- a/metrics/opensearch/mappings.py +++ b/metrics/opensearch/mappings.py @@ -1,6 +1,6 @@ -TEXT_KEYWORD_MAPPING = { +DISPLAY_TEXT_MAPPING = { "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 512}}, + "index": False, } IDENTIFIERS_MAPPING = {"type": "object", "dynamic": True} @@ -9,7 +9,7 @@ "properties": { "id": {"type": "keyword"}, "type": {"type": "keyword"}, - "title": TEXT_KEYWORD_MAPPING, + "title": DISPLAY_TEXT_MAPPING, "parent_id": {"type": "keyword"}, "publication_year": {"type": "integer"}, "identifiers": IDENTIFIERS_MAPPING, @@ -20,10 +20,10 @@ "properties": { "id": {"type": "keyword"}, "type": {"type": "keyword"}, - "title": TEXT_KEYWORD_MAPPING, + "title": DISPLAY_TEXT_MAPPING, "scielo_issn": {"type": "keyword"}, "acronym": {"type": "keyword"}, - "publisher_name": {"type": "keyword"}, + "publisher_name": DISPLAY_TEXT_MAPPING, "access_type": {"type": "keyword"}, "city": {"type": "keyword"}, "country": {"type": "keyword"}, @@ -89,14 +89,6 @@ def 
_build_index_mappings(granularity): BOOKS_MONTH_INDEX_MAPPINGS = _build_index_mappings("month") -METRIC_FIELDS = ( - "total_requests", - "total_investigations", - "unique_requests", - "unique_investigations", -) - - def get_index_mappings(collection, granularity): if granularity not in {"month", "year"}: raise ValueError("Granularity must be 'month' or 'year'.") diff --git a/metrics/opensearch/names.py b/metrics/opensearch/names.py index 1ecd493..b567d11 100644 --- a/metrics/opensearch/names.py +++ b/metrics/opensearch/names.py @@ -1,7 +1,7 @@ -from django.conf import settings +from config.collections import get_collection_size -def _validate_index_inputs(index_prefix: str, collection: str, date: str): +def _validate_index_inputs(index_prefix, collection, date): if not date or not isinstance(date, str): raise ValueError("Date must be a non-empty string in 'YYYY-MM-DD' format.") if not collection or not isinstance(collection, str): @@ -10,32 +10,22 @@ def _validate_index_inputs(index_prefix: str, collection: str, date: str): raise ValueError("Index prefix must be a non-empty string.") -def _get_collection_size(collection: str) -> str: - return getattr(settings, "COLLECTION_ACRON3_SIZE_MAP", {}).get(collection, "small") - - -def extract_access_year(date: str) -> str: +def extract_access_year(date): _validate_index_inputs("usage", "tmp", date) return date.split("-")[0] -def extract_access_month(date: str) -> str: - _validate_index_inputs("usage", "tmp", date) - year, month, _ = date.split("-") - return f"{year}{month}" - - -def generate_month_index_name(index_prefix: str, collection: str, date: str) -> str: +def generate_month_index_name(index_prefix, collection, date): _validate_index_inputs(index_prefix, collection, date) - size = _get_collection_size(collection) + size = get_collection_size(collection) if size in ("xlarge", "large"): return f"{index_prefix}_monthly_{collection}_{extract_access_year(date)}" return f"{index_prefix}_monthly_{collection}" -def 
generate_year_index_name(index_prefix: str, collection: str, date: str) -> str: +def generate_year_index_name(index_prefix, collection, date): _validate_index_inputs(index_prefix, collection, date) - size = _get_collection_size(collection) + size = get_collection_size(collection) if size in ("xlarge", "large"): return f"{index_prefix}_yearly_{collection}_{extract_access_year(date)}" return f"{index_prefix}_yearly_{collection}" diff --git a/metrics/opensearch/scripts.py b/metrics/opensearch/painless.py similarity index 91% rename from metrics/opensearch/scripts.py rename to metrics/opensearch/painless.py index a6a5e1c..de3de81 100644 --- a/metrics/opensearch/scripts.py +++ b/metrics/opensearch/painless.py @@ -14,7 +14,9 @@ return; } for (entry in params.document.entrySet()) { - if (!params.metric_fields.contains(entry.getKey()) && !'applied_jobs'.equals(entry.getKey()) && !'daily_metrics'.equals(entry.getKey())) { + if (!params.metric_fields.contains(entry.getKey()) + && !'applied_jobs'.equals(entry.getKey()) + && !'daily_metrics'.equals(entry.getKey())) { if (!ctx._source.containsKey(entry.getKey()) || ctx._source[entry.getKey()] != entry.getValue()) { ctx._source[entry.getKey()] = entry.getValue(); } @@ -36,7 +38,8 @@ def dayMetrics = dayEntry.getValue(); ctx._source.daily_metrics[day] = new HashMap(); } for (metric in params.metric_fields) { - def currentValue = ctx._source.daily_metrics[day].containsKey(metric) ? ctx._source.daily_metrics[day][metric] : 0; + def currentValue = ctx._source.daily_metrics[day].containsKey(metric) + ? ctx._source.daily_metrics[day][metric] : 0; def increment = dayMetrics.containsKey(metric) ? 
dayMetrics[metric] : 0; ctx._source.daily_metrics[day][metric] = currentValue + increment; } @@ -92,7 +95,9 @@ def merge_metric_document(existing, current, operation="add"): for day, metrics in current["daily_metrics"].items(): day_merged = dict(merged_daily.get(day) or {}) for field in METRIC_FIELDS: - day_merged[field] = day_merged.get(field, 0) + signal * metrics.get(field, 0) + day_merged[field] = day_merged.get(field, 0) + signal * metrics.get( + field, 0 + ) merged_daily[day] = day_merged merged["daily_metrics"] = merged_daily diff --git a/metrics/services/__init__.py b/metrics/services/__init__.py index b305681..e69de29 100644 --- a/metrics/services/__init__.py +++ b/metrics/services/__init__.py @@ -1,26 +0,0 @@ -from .jobs import ( - acquire_daily_metric_job, - create_or_update_daily_metric_job, - mark_daily_metric_job_exported, - mark_daily_metric_job_failed, - release_stale_daily_metric_jobs, -) -from .resources import ( - build_search_client, - extract_celery_queue_name, - fetch_required_resources, - get_log_files_for_collection_date, -) -from .parser import ( - is_stale_parsing_log, - process_daily_metric_job, - process_line, - requeue_stale_parsing_log, - setup_parsing_environment, - touch_parse_heartbeat, -) -from .export import ( - export_daily_metric_payload, - export_documents, - load_daily_metric_payload, -) diff --git a/metrics/services/daily_metric_exports.py b/metrics/services/daily_metric_exports.py new file mode 100644 index 0000000..8933b3d --- /dev/null +++ b/metrics/services/daily_metric_exports.py @@ -0,0 +1,71 @@ +import logging + +from metrics.models import DailyMetricJob +from metrics.opensearch.client import OpenSearchUsageClient +from metrics.services.export import ( + export_daily_metric_payload, + load_daily_metric_payload, +) +from metrics.services.jobs import ( + acquire_daily_metric_job, + mark_daily_metric_job_exported, + mark_daily_metric_job_failed, +) +from metrics.services.parsing.job_payloads import 
build_daily_metric_job_payload +from metrics.services.resources import fetch_required_resources + + +def build_and_export_daily_metric_job(job_id, track_errors=False, robots_source=None): + try: + job = acquire_daily_metric_job(job_id) + except DailyMetricJob.DoesNotExist: + logging.error("Daily metric job %s does not exist.", job_id) + return + + if not job: + return + + try: + payload = _load_or_build_payload( + job=job, + track_errors=track_errors, + robots_source=robots_source, + ) + _export_payload(job=job, payload=payload) + except Exception as exc: + logging.error("Failed to process daily metric job %s: %s", job_id, exc) + mark_daily_metric_job_failed(job, exc) + return + + mark_daily_metric_job_exported(job) + + +def _load_or_build_payload(job, track_errors, robots_source): + payload = load_daily_metric_payload(job) + if payload is not None and job.payload_hash: + return payload + + robots_list, mmdb = fetch_required_resources(robot_source=robots_source) + if not robots_list or not mmdb: + raise RuntimeError("Required parsing resources are not available.") + + payload = build_daily_metric_job_payload( + job=job, + robots_list=robots_list, + mmdb=mmdb, + track_errors=track_errors, + ) + job.refresh_from_db() + return payload + + +def _export_payload(job, payload): + search_client = OpenSearchUsageClient() + if not search_client.ping(): + raise RuntimeError("OpenSearch client is not available.") + + export_daily_metric_payload( + search_client=search_client, + job=job, + payload=payload, + ) diff --git a/metrics/services/daily_payloads.py b/metrics/services/daily_payloads.py index 8b96f7b..f908f1f 100644 --- a/metrics/services/daily_payloads.py +++ b/metrics/services/daily_payloads.py @@ -28,20 +28,13 @@ def resolve_storage_path(storage_path): return get_daily_payload_root() / storage_path -def serialize_payload(payload): - return json.dumps( - payload, - ensure_ascii=True, - sort_keys=True, - separators=(",", ":"), - ) - - def write_payload(storage_path, 
payload): resolved_path = resolve_storage_path(storage_path) resolved_path.parent.mkdir(parents=True, exist_ok=True) - payload_json = serialize_payload(payload) + payload_json = json.dumps( + payload, ensure_ascii=True, sort_keys=True, separators=(",", ":") + ) payload_hash = hashlib.sha256(payload_json.encode("utf-8")).hexdigest() tmp_path = resolved_path.with_suffix(f"{resolved_path.suffix}.tmp") @@ -56,18 +49,16 @@ def read_payload(storage_path): return json.loads(resolved_path.read_text(encoding="utf-8")) -def delete_payload(storage_path): - resolved_path = resolve_storage_path(storage_path) - if resolved_path.exists(): - resolved_path.unlink() - - def cleanup_exported_payloads(collections=None, older_than_days=7): root = get_daily_payload_root() if not root.exists(): return 0 - cutoff = timezone.now() - timedelta(days=older_than_days) if older_than_days and older_than_days > 0 else None + cutoff = ( + timezone.now() - timedelta(days=older_than_days) + if older_than_days and older_than_days > 0 + else None + ) storage_path_to_job = {} db_queryset = DailyMetricJob.objects.exclude(storage_path="") @@ -78,11 +69,13 @@ def cleanup_exported_payloads(collections=None, older_than_days=7): json_files = root.rglob("*.json") if collections: - json_files = [p for p in json_files if p.relative_to(root).parts[0] in collections] + json_files = [ + p for p in json_files if p.relative_to(root).parts[0] in collections + ] deleted_count = 0 for file_path in json_files: - if cutoff and _file_is_recent(file_path, cutoff): + if cutoff and file_path.stat().st_mtime >= cutoff.timestamp(): continue storage_path = file_path.relative_to(root).as_posix() @@ -113,10 +106,6 @@ def cleanup_exported_payloads(collections=None, older_than_days=7): return deleted_count -def _file_is_recent(file_path, cutoff): - return file_path.stat().st_mtime >= cutoff.timestamp() - - def _cleanup_empty_dirs(root): for dirpath, dirnames, filenames in os.walk(root, topdown=False): if dirpath == str(root): diff 
--git a/metrics/services/export.py b/metrics/services/export.py index ef5d9f6..4c3def9 100644 --- a/metrics/services/export.py +++ b/metrics/services/export.py @@ -2,10 +2,9 @@ from django.conf import settings -from metrics import opensearch +from metrics.opensearch.mappings import get_index_mappings from metrics.opensearch.names import generate_month_index_name, generate_year_index_name - -from . import daily_payloads +from metrics.services import daily_payloads def load_daily_metric_payload(job): @@ -71,14 +70,14 @@ def _sync_documents_group( collection=collection, date=f"{access.get('month')}-01", ) - mappings = opensearch.get_index_mappings(collection, "month") + mappings = get_index_mappings(collection, "month") else: index_name = generate_year_index_name( index_prefix=index_prefix, collection=collection, date=f"{access.get('year')}-01-01", ) - mappings = opensearch.get_index_mappings(collection, "year") + mappings = get_index_mappings(collection, "year") grouped_documents.setdefault( index_name, {"mappings": mappings, "documents": {}} diff --git a/metrics/services/jobs.py b/metrics/services/jobs.py index 78f5100..3456182 100644 --- a/metrics/services/jobs.py +++ b/metrics/services/jobs.py @@ -6,7 +6,6 @@ from log_manager import choices from log_manager.models import LogFile - from metrics.models import DailyMetricJob @@ -74,7 +73,9 @@ def acquire_daily_metric_job(job_id): DailyMetricJob.STATUS_EXPORTING, DailyMetricJob.STATUS_EXPORTED, }: - logging.info("Daily metric job %s is already in final/active state.", job_id) + logging.info( + "Daily metric job %s is already in final/active state.", job_id + ) return None job.status = DailyMetricJob.STATUS_EXPORTING @@ -106,7 +107,7 @@ def mark_daily_metric_job_failed(job, error_message): ) -def mark_daily_metric_job_exported(job, user=None): +def mark_daily_metric_job_exported(job): DailyMetricJob.objects.filter(pk=job.pk).update( status=DailyMetricJob.STATUS_EXPORTED, error_message="", @@ -120,7 +121,12 @@ def 
mark_daily_metric_job_exported(job, user=None): ) -def release_stale_daily_metric_jobs(collections=None, from_date=None, until_date=None, stale_after_minutes=60): +def release_stale_daily_metric_jobs( + collections=None, + from_date=None, + until_date=None, + stale_after_minutes=60, +): cutoff = timezone.now() - timedelta(minutes=stale_after_minutes) queryset = DailyMetricJob.objects.filter( status=DailyMetricJob.STATUS_EXPORTING, @@ -140,9 +146,7 @@ def release_stale_daily_metric_jobs(collections=None, from_date=None, until_date updated=timezone.now(), ) stale_hashes = { - log_hash - for job in stale_jobs - for log_hash in (job.input_log_hashes or []) + log_hash for job in stale_jobs for log_hash in (job.input_log_hashes or []) } if stale_hashes: LogFile.objects.filter(hash__in=stale_hashes).update( diff --git a/metrics/services/log_parsing_jobs.py b/metrics/services/log_parsing_jobs.py new file mode 100644 index 0000000..de5b20f --- /dev/null +++ b/metrics/services/log_parsing_jobs.py @@ -0,0 +1,395 @@ +from collection.models import Collection +from config.collections import get_collection_parse_queue +from core.utils.date_utils import get_date_obj, get_date_range_str +from log_manager import choices +from log_manager.models import LogFile +from metrics.models import DailyMetricJob +from metrics.services.jobs import create_or_update_daily_metric_job + +AUTO_REEXECUTE_POLL_INTERVAL_SECONDS = 30 + + +def enqueue_log_parsing_jobs( + daily_metric_export_task, + wait_log_parsing_wave_task, + collections=None, + include_logs_with_error=True, + batch_size=5000, + max_log_files=None, + auto_reexecute=False, + replace=False, + track_errors=False, + from_date=None, + until_date=None, + days_to_go_back=None, + queue_name=None, + user_id=None, + username=None, + skip_log_hashes=None, + robots_source=None, +): + from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) + from_date_obj = get_date_obj(from_date) + until_date_obj = 
get_date_obj(until_date) + enqueued_logs = 0 + enqueued_jobs = 0 + reached_max_log_files = False + enqueued_wave_job_ids = [] + claimed_status_filters = list(_build_log_status_filters(include_logs_with_error)) + skip_log_hashes = set(skip_log_hashes or []) + + for collection in collections or Collection.acron3_list(): + collection_obj = Collection.objects.filter(acron3=collection).first() + if collection_obj is None: + continue + + result = _enqueue_collection_daily_jobs( + daily_metric_export_task=daily_metric_export_task, + collection=collection_obj, + from_date_obj=from_date_obj, + until_date_obj=until_date_obj, + status_filters=claimed_status_filters, + skip_log_hashes=skip_log_hashes, + enqueued_logs=enqueued_logs, + max_log_files=max_log_files, + track_errors=track_errors, + user_id=user_id, + username=username, + robots_source=robots_source, + queue_name=queue_name, + ) + + enqueued_logs += result["enqueued_logs"] + enqueued_jobs += result["enqueued_jobs"] + enqueued_wave_job_ids.extend(result["enqueued_wave_job_ids"]) + reached_max_log_files = result["reached_max_log_files"] + if result["reached_max_log_files"]: + break + + auto_reexecution_enqueued = _schedule_log_parsing_reexecution( + wait_log_parsing_wave_task=wait_log_parsing_wave_task, + should_reexecute=( + auto_reexecute and reached_max_log_files and bool(enqueued_wave_job_ids) + ), + wave_job_ids=enqueued_wave_job_ids, + collections=collections, + include_logs_with_error=include_logs_with_error, + batch_size=batch_size, + max_log_files=max_log_files, + auto_reexecute=auto_reexecute, + replace=replace, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + queue_name=queue_name, + user_id=user_id, + username=username, + skip_log_hashes=sorted(skip_log_hashes), + robots_source=robots_source, + ) + + return { + "enqueued_logs": enqueued_logs, + "enqueued_jobs": enqueued_jobs, + "reached_max_log_files": reached_max_log_files, + 
"auto_reexecution_enqueued": auto_reexecution_enqueued, + } + + +def wait_log_parsing_wave( + log_parsing_task, + wait_log_parsing_wave_task, + wave_job_ids=None, + collections=None, + include_logs_with_error=True, + batch_size=5000, + max_log_files=None, + auto_reexecute=False, + replace=False, + track_errors=False, + from_date=None, + until_date=None, + days_to_go_back=None, + queue_name=None, + user_id=None, + username=None, + skip_log_hashes=None, + poll_interval_seconds=AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, + robots_source=None, + wave_log_hashes=None, +): + wave_job_ids = wave_job_ids or wave_log_hashes or [] + if DailyMetricJob.objects.filter( + pk__in=wave_job_ids, + status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_EXPORTING], + ).exists(): + kwargs = _build_log_parsing_reexecution_kwargs( + wave_job_ids=wave_job_ids, + collections=collections, + include_logs_with_error=include_logs_with_error, + batch_size=batch_size, + max_log_files=max_log_files, + auto_reexecute=auto_reexecute, + replace=replace, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + queue_name=queue_name, + user_id=user_id, + username=username, + skip_log_hashes=skip_log_hashes, + poll_interval_seconds=poll_interval_seconds, + robots_source=robots_source, + ) + apply_kwargs = { + "kwargs": kwargs, + "countdown": poll_interval_seconds, + } + if queue_name: + apply_kwargs["queue"] = queue_name + wait_log_parsing_wave_task.apply_async(**apply_kwargs) + return {"wave_completed": False, "reexecution_enqueued": False} + + kwargs = _build_log_parsing_kwargs( + collections=collections, + include_logs_with_error=include_logs_with_error, + batch_size=batch_size, + max_log_files=max_log_files, + auto_reexecute=auto_reexecute, + replace=replace, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + queue_name=queue_name, + user_id=user_id, + username=username, + 
skip_log_hashes=skip_log_hashes, + robots_source=robots_source, + ) + apply_kwargs = {"kwargs": kwargs} + if queue_name: + apply_kwargs["queue"] = queue_name + log_parsing_task.apply_async(**apply_kwargs) + return {"wave_completed": True, "reexecution_enqueued": True} + + +def _build_log_status_filters(include_logs_with_error): + status_filters = [choices.LOG_FILE_STATUS_QUEUED] + if include_logs_with_error: + status_filters.append(choices.LOG_FILE_STATUS_ERROR) + return tuple(status_filters) + + +def _enqueue_collection_daily_jobs( + daily_metric_export_task, + collection, + from_date_obj, + until_date_obj, + status_filters, + skip_log_hashes, + enqueued_logs, + max_log_files, + track_errors, + user_id, + username, + robots_source, + queue_name, +): + result = { + "enqueued_logs": 0, + "enqueued_jobs": 0, + "enqueued_wave_job_ids": [], + "reached_max_log_files": False, + } + + access_dates = LogFile.distinct_access_dates_for_parsing( + collection=collection, + from_date=from_date_obj, + until_date=until_date_obj, + status_filters=status_filters, + skip_hashes=skip_log_hashes, + ) + + for access_date in access_dates: + log_files = LogFile.for_collection_date( + collection=collection, + access_date=access_date, + status_filters=status_filters, + ) + log_files = [ + log_file for log_file in log_files if log_file.hash not in skip_log_hashes + ] + + reached_limit = False + if max_log_files: + remaining_log_slots = max_log_files - ( + enqueued_logs + result["enqueued_logs"] + ) + if remaining_log_slots <= 0: + result["reached_max_log_files"] = True + break + if len(log_files) > remaining_log_slots: + log_files = log_files[:remaining_log_slots] + reached_limit = True + result["reached_max_log_files"] = reached_limit + + if not log_files: + continue + + job = create_or_update_daily_metric_job( + collection=collection, + access_date=access_date, + log_files=log_files, + ) + if job.status == DailyMetricJob.STATUS_EXPORTED: + if reached_limit: + break + continue + + 
daily_metric_export_task.apply_async( + args=(job.pk, track_errors, user_id, username, robots_source), + queue=queue_name or get_collection_parse_queue(collection.acron3), + ) + result["enqueued_wave_job_ids"].append(job.pk) + result["enqueued_jobs"] += 1 + result["enqueued_logs"] += len(log_files) + if max_log_files and enqueued_logs + result["enqueued_logs"] >= max_log_files: + result["reached_max_log_files"] = True + + if result["reached_max_log_files"]: + break + + return result + + +def _schedule_log_parsing_reexecution( + wait_log_parsing_wave_task, + should_reexecute, + wave_job_ids, + collections, + include_logs_with_error, + batch_size, + max_log_files, + auto_reexecute, + replace, + track_errors, + from_date, + until_date, + days_to_go_back, + queue_name, + user_id, + username, + skip_log_hashes, + robots_source=None, +): + if not should_reexecute: + return False + + kwargs = _build_log_parsing_reexecution_kwargs( + wave_job_ids=wave_job_ids, + collections=collections, + include_logs_with_error=include_logs_with_error, + batch_size=batch_size, + max_log_files=max_log_files, + auto_reexecute=auto_reexecute, + replace=replace, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + queue_name=queue_name, + user_id=user_id, + username=username, + skip_log_hashes=skip_log_hashes, + poll_interval_seconds=AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, + robots_source=robots_source, + ) + + apply_kwargs = {"kwargs": kwargs} + if queue_name: + apply_kwargs["queue"] = queue_name + wait_log_parsing_wave_task.apply_async(**apply_kwargs) + return True + + +def _build_log_parsing_reexecution_kwargs( + wave_job_ids, + collections, + include_logs_with_error, + batch_size, + max_log_files, + auto_reexecute, + replace, + track_errors, + from_date, + until_date, + days_to_go_back, + queue_name, + user_id, + username, + skip_log_hashes, + poll_interval_seconds, + robots_source=None, +): + kwargs = { + "wave_job_ids": 
wave_job_ids, + "collections": collections, + "include_logs_with_error": include_logs_with_error, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": auto_reexecute, + "replace": replace, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "skip_log_hashes": skip_log_hashes, + "poll_interval_seconds": poll_interval_seconds, + } + if robots_source is not None: + kwargs["robots_source"] = robots_source + return kwargs + + +def _build_log_parsing_kwargs( + collections, + include_logs_with_error, + batch_size, + max_log_files, + auto_reexecute, + replace, + track_errors, + from_date, + until_date, + days_to_go_back, + queue_name, + user_id, + username, + skip_log_hashes, + robots_source=None, +): + kwargs = { + "collections": collections, + "include_logs_with_error": include_logs_with_error, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": auto_reexecute, + "replace": replace, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "skip_log_hashes": skip_log_hashes, + } + if robots_source is not None: + kwargs["robots_source"] = robots_source + return kwargs diff --git a/metrics/services/parser.py b/metrics/services/parser.py deleted file mode 100644 index 5eb3dbf..0000000 --- a/metrics/services/parser.py +++ /dev/null @@ -1,249 +0,0 @@ -import logging -from datetime import timedelta -from time import monotonic - -from django.conf import settings -from django.utils import timezone - -from scielo_usage_counter import log_handler, url_translator - -from log_manager import choices -from log_manager.models import LogFile -from log_manager_config.models import CollectionLogDirectory -from source.models import Source -from 
document.models import Document -from tracker.choices import ( - LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, - LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, -) -from tracker.models import LogFileDiscardedLine - -from metrics.counter import access, documents as index_docs -from metrics.counter import parser - -from .resources import get_log_files_for_collection_date -from . import daily_payloads - - -def process_daily_metric_job(job, robots_list, mmdb, track_errors=False): - log_files = get_log_files_for_collection_date( - collection=job.collection, - access_date=job.access_date, - ) - if not log_files: - raise RuntimeError(f"No log files found for {job.collection.acron3} {job.access_date}.") - - results = {} - summary = { - "log_files": len(log_files), - "input_log_hashes": sorted(log_file.hash for log_file in log_files if log_file.hash), - "lines_parsed": 0, - "valid_lines": 0, - "discarded_lines": 0, - } - - LogFile.objects.filter(pk__in=[log_file.pk for log_file in log_files]).update( - status=choices.LOG_FILE_STATUS_PARSING, - summary={}, - last_processed_line=0, - parse_heartbeat_at=timezone.now(), - updated=timezone.now(), - ) - LogFileDiscardedLine.objects.filter(log_file_id__in=[log_file.pk for log_file in log_files]).delete() - - heartbeat_interval_seconds = getattr(settings, "METRICS_PARSE_HEARTBEAT_INTERVAL_SECONDS", 30) - - for log_file in log_files: - log_parser, url_translator_manager = setup_parsing_environment( - log_file=log_file, - robots_list=robots_list, - mmdb=mmdb, - ) - line_count = 0 - valid_count = 0 - errors = [] - last_heartbeat_monotonic = monotonic() - - for line in log_parser.parse(): - line_count += 1 - if monotonic() - last_heartbeat_monotonic >= heartbeat_interval_seconds: - touch_parse_heartbeat(log_file, log_parser.stats.lines_parsed) - last_heartbeat_monotonic = monotonic() - - is_valid_line, error_obj = process_line( - results=results, - line=line, - utm=url_translator_manager, - log_file=log_file, - 
track_errors=track_errors, - ) - if is_valid_line: - valid_count += 1 - else: - summary["discarded_lines"] += 1 - if error_obj: - errors.append(error_obj) - - if errors: - LogFileDiscardedLine.objects.bulk_create(errors) - - summary["lines_parsed"] += line_count - summary["valid_lines"] += valid_count - log_file.summary = { - "parsing_completed": True, - "lines_parsed": line_count, - "valid_lines": valid_count, - } - log_file.last_processed_line = log_parser.stats.lines_parsed - log_file.parse_heartbeat_at = timezone.now() - log_file.save( - update_fields=[ - "summary", - "last_processed_line", - "parse_heartbeat_at", - "updated", - ] - ) - - documents = index_docs.convert_raw_results_to_index_documents(results) - storage_path = daily_payloads.build_daily_storage_path(job.collection, job.access_date) - payload = { - "collection": job.collection.acron3, - "access_date": job.access_date.isoformat(), - "input_log_hashes": summary["input_log_hashes"], - "documents": documents, - "summary": summary, - } - payload_hash = daily_payloads.write_payload(storage_path, payload) - - job.input_log_hashes = summary["input_log_hashes"] - job.storage_path = storage_path.as_posix() - job.payload_hash = payload_hash - job.summary = { - **summary, - "month_document_count": len(documents.get("month", {})), - "year_document_count": len(documents.get("year", {})), - } - job.save( - update_fields=[ - "input_log_hashes", - "storage_path", - "payload_hash", - "summary", - "updated", - ] - ) - - return payload - - -def setup_parsing_environment(log_file, robots_list, mmdb): - lp = log_handler.LogParser(mmdb_data=mmdb.data, robots_list=robots_list, output_mode="dict") - lp.logfile = log_file.path - - translator_class = None - for cld in CollectionLogDirectory.objects.filter(config__collection=log_file.collection): - if cld.path in log_file.path: - if cld.translator_class: - translator_class = parser.translator_class_name_to_obj(cld.translator_class) - break - - if not translator_class: - 
raise Exception(f"No URL translator class found for collection {log_file.collection}.") - - utm = url_translator.URLTranslationManager( - documents_metadata=Document.metadata(collection=log_file.collection), - sources_metadata=Source.metadata(collection=log_file.collection), - translator=translator_class, - ) - return lp, utm - - -def process_line(results, line, utm, log_file, track_errors=False): - try: - translated_url = utm.translate(line.get("url")) - except Exception as exc: - logging.error("Error translating URL %s: %s", line.get("url"), exc) - return False, None - - try: - item_access_data = access.extract_item_access_data(log_file.collection.acron3, translated_url) - except Exception as exc: - logging.error("Error extracting item access data from URL %s: %s", line.get("url"), exc) - return False, None - - ignore_utm_validation = not track_errors - is_valid, check_result = access.is_valid_item_access_data( - item_access_data, - utm, - ignore_utm_validation, - ) - - if not is_valid: - if track_errors: - error_code = check_result.get("code") - if error_code in { - "invalid_scielo_issn", - "invalid_source_id", - "invalid_pid_v3", - "invalid_pid_v2", - "invalid_pid_generic", - }: - tracker_error_type = ( - LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT - if "pid" in error_code - else LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE - ) - - return False, LogFileDiscardedLine.create( - log_file=log_file, - error_type=tracker_error_type, - message=check_result.get("message"), - data={"line": line, "item_access_data": item_access_data}, - save=False, - ) - - return False, None - - try: - access.update_results_with_item_access_data(results, item_access_data, line) - except Exception as exc: - logging.error("Error updating metrics results for URL %s: %s", line.get("url"), exc) - return False, None - - return True, None - - -def touch_parse_heartbeat(log_file, last_processed_line=None): - heartbeat_at = timezone.now() - update_kwargs = { - "parse_heartbeat_at": 
heartbeat_at, - "updated": heartbeat_at, - } - if last_processed_line is not None: - update_kwargs["last_processed_line"] = last_processed_line or 0 - log_file.last_processed_line = last_processed_line or 0 - LogFile.objects.filter(pk=log_file.pk).update(**update_kwargs) - log_file.parse_heartbeat_at = heartbeat_at - - -def is_stale_parsing_log(log_file, stale_after_minutes=60): - if log_file.status != choices.LOG_FILE_STATUS_PARSING: - return False - - if not log_file.parse_heartbeat_at: - return True - - cutoff = timezone.now() - timedelta(minutes=stale_after_minutes) - return log_file.parse_heartbeat_at < cutoff - - -def requeue_stale_parsing_log(log_file): - now = timezone.now() - LogFile.objects.filter(pk=log_file.pk).update( - status=choices.LOG_FILE_STATUS_ERROR, - parse_heartbeat_at=None, - updated=now, - ) - log_file.status = choices.LOG_FILE_STATUS_ERROR - log_file.parse_heartbeat_at = None diff --git a/metrics/services/parsing/__init__.py b/metrics/services/parsing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/services/parsing/environment.py b/metrics/services/parsing/environment.py new file mode 100644 index 0000000..dba5567 --- /dev/null +++ b/metrics/services/parsing/environment.py @@ -0,0 +1,58 @@ +from scielo_usage_counter.translator.books import URLTranslatorBooksSite +from scielo_usage_counter.translator.classic import URLTranslatorClassicSite +from scielo_usage_counter.translator.dataverse import URLTranslatorDataverseSite +from scielo_usage_counter.translator.opac import URLTranslatorOPACSite +from scielo_usage_counter.translator.opac_alpha import URLTranslatorOPACAlphaSite +from scielo_usage_counter.translator.preprints import URLTranslatorPreprintsSite + +from document.models import Document +from log_manager_config.models import CollectionLogDirectory +from scielo_usage_counter import log_handler, url_translator +from source.models import Source + + +def setup_parsing_environment(log_file, robots_list, mmdb): + 
log_parser = log_handler.LogParser( + mmdb_data=mmdb.data, + robots_list=robots_list, + output_mode="dict", + ) + log_parser.logfile = log_file.path + + translator_class = _get_log_file_translator_class(log_file) + if not translator_class: + raise Exception( + f"No URL translator class found for collection {log_file.collection}." + ) + + url_translator_manager = url_translator.URLTranslationManager( + documents_metadata=Document.metadata(collection=log_file.collection), + sources_metadata=Source.metadata(collection=log_file.collection), + translator=translator_class, + ) + return log_parser, url_translator_manager + + +def _get_log_file_translator_class(log_file): + for directory in CollectionLogDirectory.objects.filter( + config__collection=log_file.collection, + ): + if directory.path in log_file.path and directory.translator_class: + return _get_translator_class(directory.translator_class) + + return None + + +def _get_translator_class(name): + if not name or not isinstance(name, str): + return None + + translator_classes = { + "books": URLTranslatorBooksSite, + "classic": URLTranslatorClassicSite, + "dataverse": URLTranslatorDataverseSite, + "opac": URLTranslatorOPACSite, + "opac_alpha": URLTranslatorOPACAlphaSite, + "preprints": URLTranslatorPreprintsSite, + } + return translator_classes.get(name.lower()) diff --git a/metrics/services/parsing/job_payloads.py b/metrics/services/parsing/job_payloads.py new file mode 100644 index 0000000..fa30b3b --- /dev/null +++ b/metrics/services/parsing/job_payloads.py @@ -0,0 +1,158 @@ +from time import monotonic + +from django.conf import settings + +from log_manager.models import LogFile +from metrics.counter.indexing import converter as index_docs +from metrics.services import daily_payloads +from metrics.services.parsing.environment import setup_parsing_environment +from metrics.services.parsing.lines import process_line +from metrics.services.parsing.log_files import ( + clear_discarded_lines, + mark_log_file_completed, 
def build_daily_metric_job_payload(job, robots_list, mmdb, track_errors=False):
    """Parse every input log of ``job`` and write the resulting payload.

    The job's input log hashes are sorted, the matching log files are
    resolved (a missing hash makes ``_get_job_log_files`` raise), the files
    are marked as parsing and their previously stored discarded lines are
    cleared. Each file is then parsed into one shared results mapping, which
    is converted to index documents and written through
    ``_write_job_payload``.

    Returns the payload produced by ``_write_job_payload``.
    """
    hashes = list(job.input_log_hashes or [])
    hashes.sort()

    job_log_files = _get_job_log_files(job, hashes)
    totals = _initial_summary(job_log_files, hashes)

    mark_logs_as_parsing(job_log_files)
    clear_discarded_lines(job_log_files)

    # All files of the job accumulate into the same mapping.
    accumulated = {}
    for current_log_file in job_log_files:
        per_file_summary = _parse_log_file_into_results(
            log_file=current_log_file,
            results=accumulated,
            robots_list=robots_list,
            mmdb=mmdb,
            track_errors=track_errors,
        )
        _merge_log_summary(totals, per_file_summary)

    return _write_job_payload(job, index_docs.convert(accumulated), totals)
def _parse_log_file_into_results(
    log_file, results, robots_list, mmdb, track_errors=False
):
    """Parse one log file, accumulating its valid accesses into ``results``.

    Args:
        log_file: the log file record to parse.
        results: mapping mutated in place by ``process_line``.
        robots_list: robot patterns forwarded to the parsing environment.
        mmdb: MMDB resource forwarded to the parsing environment.
        track_errors: forwarded to ``process_line``; when it yields error
            objects they are persisted after the file has been read.

    Returns:
        dict with the ``lines_parsed``, ``valid_lines`` and
        ``discarded_lines`` counters for this file.
    """
    log_parser, url_translator_manager = setup_parsing_environment(
        log_file=log_file,
        robots_list=robots_list,
        mmdb=mmdb,
    )
    heartbeat_interval_seconds = getattr(
        settings,
        "METRICS_PARSE_HEARTBEAT_INTERVAL_SECONDS",
        30,
    )
    # Optional setting; bounds how many discarded-line rows go into a single
    # INSERT so a log with many tracked errors does not produce one huge
    # statement.
    discarded_lines_batch_size = getattr(
        settings,
        "METRICS_DISCARDED_LINES_BULK_BATCH_SIZE",
        1000,
    )
    summary = {
        "lines_parsed": 0,
        "valid_lines": 0,
        "discarded_lines": 0,
    }
    errors = []
    last_heartbeat_monotonic = monotonic()

    for line in log_parser.parse():
        summary["lines_parsed"] += 1
        # Refresh the heartbeat at most once per interval so long parses are
        # not mistaken for stale ones.
        if monotonic() - last_heartbeat_monotonic >= heartbeat_interval_seconds:
            touch_parse_heartbeat(log_file, log_parser.stats.lines_parsed)
            last_heartbeat_monotonic = monotonic()

        is_valid_line, error_obj = process_line(
            results=results,
            line=line,
            utm=url_translator_manager,
            log_file=log_file,
            track_errors=track_errors,
        )
        if is_valid_line:
            summary["valid_lines"] += 1
        else:
            summary["discarded_lines"] += 1
            if error_obj:
                errors.append(error_obj)

    if errors:
        LogFileDiscardedLine.objects.bulk_create(
            errors,
            batch_size=discarded_lines_batch_size,
        )

    mark_log_file_completed(log_file, log_parser, summary)
    return summary
# Validation error codes for which a discarded line is recorded when error
# tracking is enabled; a discarded line with any other code yields no record.
TRACKED_VALIDATION_ERROR_CODES = {
    "invalid_pid_generic",
    "invalid_pid_v2",
    "invalid_pid_v3",
    "invalid_scielo_issn",
    "invalid_source_id",
}
line, + ) + except Exception as exc: + logging.error( + "Error updating metrics results for URL %s: %s", + line.get("url"), + exc, + ) + return False, None + + return True, None + + +def _build_discarded_line_error( + track_errors, + check_result, + log_file, + line, + counter_access, +): + if not track_errors: + return False, None + + error_code = check_result.get("code") + if error_code not in TRACKED_VALIDATION_ERROR_CODES: + return False, None + + tracker_error_type = ( + LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT + if "pid" in error_code + else LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE + ) + + return False, LogFileDiscardedLine.create( + log_file=log_file, + error_type=tracker_error_type, + message=check_result.get("message"), + data={"line": line, "item_access_data": counter_access}, + save=False, + ) diff --git a/metrics/services/parsing/log_files.py b/metrics/services/parsing/log_files.py new file mode 100644 index 0000000..5bfbfa9 --- /dev/null +++ b/metrics/services/parsing/log_files.py @@ -0,0 +1,78 @@ +from datetime import timedelta + +from django.utils import timezone + +from log_manager import choices +from log_manager.models import LogFile +from tracker.models import LogFileDiscardedLine + + +def mark_logs_as_parsing(log_files): + now = timezone.now() + LogFile.objects.filter(pk__in=[log_file.pk for log_file in log_files]).update( + status=choices.LOG_FILE_STATUS_PARSING, + summary={}, + last_processed_line=0, + parse_heartbeat_at=now, + updated=now, + ) + + +def clear_discarded_lines(log_files): + LogFileDiscardedLine.objects.filter( + log_file_id__in=[log_file.pk for log_file in log_files] + ).delete() + + +def mark_log_file_completed(log_file, log_parser, summary): + log_file.summary = { + "parsing_completed": True, + "lines_parsed": summary["lines_parsed"], + "valid_lines": summary["valid_lines"], + } + log_file.last_processed_line = log_parser.stats.lines_parsed + log_file.parse_heartbeat_at = timezone.now() + log_file.save( + 
def is_stale_parsing_log(log_file, stale_after_minutes=60):
    """Tell whether a log file in the parsing status has stopped reporting.

    A log file that is not in the parsing status is never stale. One that is
    parsing is stale when it has no heartbeat at all, or when its last
    heartbeat is older than ``stale_after_minutes`` minutes.
    """
    if log_file.status == choices.LOG_FILE_STATUS_PARSING:
        heartbeat = log_file.parse_heartbeat_at
        if not heartbeat:
            return True
        oldest_allowed = timezone.now() - timedelta(minutes=stale_after_minutes)
        return heartbeat < oldest_allowed

    return False
def resume_daily_metric_jobs(
    daily_metric_export_task,
    collections=None,
    from_date=None,
    until_date=None,
    days_to_go_back=None,
    stale_after_minutes=60,
    queue_name=None,
    user_id=None,
    username=None,
    robots_source=None,
):
    """Release stale daily metric jobs and re-enqueue the resumable ones.

    The date window is resolved with ``get_date_range_str``. Stale jobs in
    that window are released first, then resumable jobs are enqueued through
    ``daily_metric_export_task``.

    Returns a dict with the number of resumed and released jobs; each count
    is exposed under two keys (``resumed_logs``/``resumed_jobs`` and
    ``released_stale_batches``/``released_stale_jobs``).
    """
    from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back)
    window_start = get_date_obj(from_date)
    window_end = get_date_obj(until_date)

    released_count = release_stale_daily_metric_jobs(
        collections=collections,
        from_date=window_start,
        until_date=window_end,
        stale_after_minutes=stale_after_minutes,
    )
    resumed_count = _enqueue_resumable_daily_metric_jobs(
        daily_metric_export_task=daily_metric_export_task,
        collections=collections,
        from_date_obj=window_start,
        until_date_obj=window_end,
        queue_name=queue_name,
        user_id=user_id,
        username=username,
        robots_source=robots_source,
    )

    logging.info(
        "Resumed daily metric jobs for %s day(s); released %s stale job(s) at %s.",
        resumed_count,
        released_count,
        timezone.now(),
    )

    outcome = {}
    outcome["resumed_logs"] = resumed_count
    outcome["resumed_jobs"] = resumed_count
    outcome["released_stale_batches"] = released_count
    outcome["released_stale_jobs"] = released_count
    return outcome
def _requeue_matching_stale_logs(
    collections,
    from_date_obj,
    until_date_obj,
    stale_after_minutes,
    max_log_files,
):
    """Requeue stale parsing logs whose date falls inside the given range.

    Walks the logs returned by ``_get_parsing_logs`` and, for each one whose
    validation date is inside ``[from_date_obj, until_date_obj]`` and that is
    stale, calls ``requeue_stale_parsing_log``. Stops early once
    ``max_log_files`` logs were requeued (when that limit is truthy).

    Returns the number of logs requeued.
    """
    requeued = 0
    for candidate in _get_parsing_logs(collections):
        log_date = _extract_date_from_validation_dict(candidate.validation)
        # The date check runs first so staleness is only evaluated for logs
        # inside the requested window.
        eligible = _is_log_date_inside_range(
            log_date, from_date_obj, until_date_obj
        ) and is_stale_parsing_log(
            candidate, stale_after_minutes=stale_after_minutes
        )
        if eligible:
            requeue_stale_parsing_log(candidate)
            requeued += 1
            if max_log_files and requeued >= max_log_files:
                break
    return requeued
.order_by("validation__probably_date", "path", "hash") + ) + if collections: + queryset = queryset.filter(collection__acron3__in=collections) + return queryset + + +def _is_log_date_inside_range(probably_date, from_date_obj, until_date_obj): + return probably_date and from_date_obj <= probably_date <= until_date_obj + + +def _enqueue_log_parsing_retry( + log_parsing_task, + collections, + batch_size, + track_errors, + from_date, + until_date, + max_log_files, + queue_name, + user_id, + username, + robots_source, +): + apply_kwargs = { + "kwargs": { + "collections": collections, + "include_logs_with_error": True, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": False, + "replace": False, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": None, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "robots_source": robots_source, + } + } + if queue_name: + apply_kwargs["queue"] = queue_name + log_parsing_task.apply_async(**apply_kwargs) + + +def _extract_date_from_validation_dict(validation): + try: + date_str = validation.get("probably_date") + return get_date_obj(date_str, "%Y-%m-%d") + except Exception as e: + logging.error(f"Failed to extract date from validation: {e}") + return None diff --git a/metrics/tasks/__init__.py b/metrics/tasks/__init__.py index f0c2d6a..e69de29 100644 --- a/metrics/tasks/__init__.py +++ b/metrics/tasks/__init__.py @@ -1,19 +0,0 @@ -from .parse import ( - task_parse_logs, - task_wait_parse_logs_wave, -) -from .process import ( - task_process_daily_metric_job, -) -from .resume import ( - task_resume_log_exports, - task_resume_stale_parsing_logs, -) -from .index import ( - task_create_index, - task_delete_index, - task_delete_documents_by_key, -) -from .cleanup import ( - task_cleanup_daily_payloads, -) diff --git a/metrics/tasks/daily_metric_exports.py b/metrics/tasks/daily_metric_exports.py new file mode 100644 index 
@celery_app.task(bind=True, name=_("[Metrics] Process Daily Job"), timelimit=-1)
def task_build_and_export_daily_metric_job(
    self,
    job_id,
    track_errors=False,
    user_id=None,
    username=None,
    robots_source=None,
):
    """Celery entry point that builds and exports one daily metric job.

    Calls ``_get_user`` for the requesting user, then delegates to
    ``build_and_export_daily_metric_job`` and returns its result.
    """
    # The return value of _get_user is not used here.
    _get_user(self.request, user_id=user_id, username=username)

    export_options = {
        "job_id": job_id,
        "track_errors": track_errors,
        "robots_source": robots_source,
    }
    return build_and_export_daily_metric_job(**export_options)
@celery_app.task(
    bind=True, name=_("[Log Pipeline] 3. Parse Logs (Manual)"), timelimit=-1
)
def task_enqueue_log_parsing_jobs(
    self,
    collections=None,
    include_logs_with_error=True,
    batch_size=5000,
    max_log_files=None,
    auto_reexecute=False,
    replace=False,
    track_errors=False,
    from_date=None,
    until_date=None,
    days_to_go_back=None,
    queue_name=None,
    user_id=None,
    username=None,
    skip_log_hashes=None,
    robots_source=None,
):
    """Celery entry point that enqueues log parsing jobs.

    Rejects ``replace=True`` with a ``ValueError`` and otherwise forwards
    every option to ``log_parsing_jobs.enqueue_log_parsing_jobs``, together
    with the export task and the wave-waiting task it should schedule.

    Returns whatever ``enqueue_log_parsing_jobs`` returns.
    """
    if replace:
        raise ValueError(
            "replace=True is not supported. Recompute requires deleting/recreating "
            "the affected day or period first."
        )

    # Options passed through unchanged to the service layer.
    forwarded_options = dict(
        collections=collections,
        include_logs_with_error=include_logs_with_error,
        batch_size=batch_size,
        max_log_files=max_log_files,
        auto_reexecute=auto_reexecute,
        replace=replace,
        track_errors=track_errors,
        from_date=from_date,
        until_date=until_date,
        days_to_go_back=days_to_go_back,
        queue_name=queue_name,
        user_id=user_id,
        username=username,
        skip_log_hashes=skip_log_hashes,
        robots_source=robots_source,
    )
    return log_parsing_jobs.enqueue_log_parsing_jobs(
        daily_metric_export_task=task_build_and_export_daily_metric_job,
        wait_log_parsing_wave_task=task_wait_log_parsing_wave,
        **forwarded_options,
    )
--- a/metrics/tasks/parse.py +++ /dev/null @@ -1,295 +0,0 @@ -import logging - -from django.utils.translation import gettext as _ - -from config import celery_app -from core.utils.date_utils import get_date_obj, get_date_range_str -from core.utils.request_utils import _get_user -from collection.models import Collection -from log_manager import choices -from log_manager.models import LogFile -from metrics.models import DailyMetricJob - -from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date -from metrics.services.jobs import create_or_update_daily_metric_job -from metrics.tasks.process import task_process_daily_metric_job - -AUTO_REEXECUTE_POLL_INTERVAL_SECONDS = 30 - - -@celery_app.task(bind=True, name=_("[Log Pipeline] 3. Parse Logs (Manual)"), timelimit=-1) -def task_parse_logs( - self, - collections=None, - include_logs_with_error=True, - batch_size=5000, - max_log_files=None, - auto_reexecute=False, - replace=False, - track_errors=False, - from_date=None, - until_date=None, - days_to_go_back=None, - queue_name=None, - user_id=None, - username=None, - skip_log_hashes=None, - robots_source=None, -): - if replace: - raise ValueError( - "replace=True is not supported. Recompute requires deleting/recreating " - "the affected day or period first." 
- ) - - from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) - from_date_obj = get_date_obj(from_date) - until_date_obj = get_date_obj(until_date) - enqueued_jobs = 0 - reached_max_log_files = False - enqueued_wave_job_ids = [] - claimed_status_filters = list(_build_parse_status_filters(include_logs_with_error)) - skip_log_hashes = set(skip_log_hashes or []) - - for collection in collections or Collection.acron3_list(): - collection_obj = Collection.objects.filter(acron3=collection).first() - if not collection_obj: - continue - - access_dates = _find_access_dates( - collection=collection_obj, - from_date=from_date, - until_date=until_date, - from_date_obj=from_date_obj, - until_date_obj=until_date_obj, - status_filters=claimed_status_filters, - skip_log_hashes=skip_log_hashes, - ) - - for access_date in access_dates: - log_files = get_log_files_for_collection_date( - collection=collection_obj, - access_date=access_date, - status_filters=claimed_status_filters, - ) - log_files = [log_file for log_file in log_files if log_file.hash not in skip_log_hashes] - if not log_files: - continue - - job = create_or_update_daily_metric_job( - collection=collection_obj, - access_date=access_date, - log_files=log_files, - ) - if job.status == DailyMetricJob.STATUS_EXPORTED: - continue - - task_process_daily_metric_job.apply_async( - args=(job.pk, track_errors, user_id, username, robots_source), - queue=queue_name or extract_celery_queue_name(collection), - ) - enqueued_wave_job_ids.append(job.pk) - enqueued_jobs += 1 - if max_log_files and enqueued_jobs >= max_log_files: - reached_max_log_files = True - break - - if reached_max_log_files: - break - - auto_reexecution_enqueued = _schedule_parse_logs_reexecution( - should_reexecute=auto_reexecute and reached_max_log_files and bool(enqueued_wave_job_ids), - wave_job_ids=enqueued_wave_job_ids, - collections=collections, - include_logs_with_error=include_logs_with_error, - batch_size=batch_size, - 
max_log_files=max_log_files, - auto_reexecute=auto_reexecute, - replace=replace, - track_errors=track_errors, - from_date=from_date, - until_date=until_date, - days_to_go_back=days_to_go_back, - queue_name=queue_name, - user_id=user_id, - username=username, - skip_log_hashes=sorted(skip_log_hashes), - robots_source=robots_source, - ) - - return { - "enqueued_logs": enqueued_jobs, - "enqueued_jobs": enqueued_jobs, - "reached_max_log_files": reached_max_log_files, - "auto_reexecution_enqueued": auto_reexecution_enqueued, - } - - -def _build_parse_status_filters(include_logs_with_error): - status_filters = [choices.LOG_FILE_STATUS_QUEUED] - if include_logs_with_error: - status_filters.append(choices.LOG_FILE_STATUS_ERROR) - return tuple(status_filters) - - -def _find_access_dates( - collection, - from_date, - until_date, - from_date_obj, - until_date_obj, - status_filters, - skip_log_hashes, -): - date_queryset = ( - LogFile.objects.filter( - status__in=status_filters, - collection=collection, - date__gte=from_date_obj, - date__lte=until_date_obj, - ) - .exclude(hash__in=skip_log_hashes) - .values_list("date", flat=True) - .distinct() - .order_by("date") - ) - - access_dates = set() - for value in list(date_queryset): - access_date = value if hasattr(value, "isoformat") else get_date_obj(value) - if access_date and from_date_obj <= access_date <= until_date_obj: - access_dates.add(access_date) - return sorted(access_dates) - - -def _schedule_parse_logs_reexecution( - should_reexecute, - wave_job_ids, - collections, - include_logs_with_error, - batch_size, - max_log_files, - auto_reexecute, - replace, - track_errors, - from_date, - until_date, - days_to_go_back, - queue_name, - user_id, - username, - skip_log_hashes, - robots_source=None, -): - if not should_reexecute: - return False - - kwargs = { - "wave_job_ids": wave_job_ids, - "collections": collections, - "include_logs_with_error": include_logs_with_error, - "batch_size": batch_size, - "max_log_files": 
max_log_files, - "auto_reexecute": auto_reexecute, - "replace": replace, - "track_errors": track_errors, - "from_date": from_date, - "until_date": until_date, - "days_to_go_back": days_to_go_back, - "queue_name": queue_name, - "user_id": user_id, - "username": username, - "skip_log_hashes": skip_log_hashes, - "poll_interval_seconds": AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, - } - if robots_source is not None: - kwargs["robots_source"] = robots_source - - apply_kwargs = {"kwargs": kwargs} - if queue_name: - apply_kwargs["queue"] = queue_name - task_wait_parse_logs_wave.apply_async(**apply_kwargs) - return True - - -@celery_app.task(bind=True, name=_("[Metrics] Wait Parse Logs Wave"), timelimit=-1) -def task_wait_parse_logs_wave( - self, - wave_job_ids=None, - collections=None, - include_logs_with_error=True, - batch_size=5000, - max_log_files=None, - auto_reexecute=False, - replace=False, - track_errors=False, - from_date=None, - until_date=None, - days_to_go_back=None, - queue_name=None, - user_id=None, - username=None, - skip_log_hashes=None, - poll_interval_seconds=AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, - robots_source=None, - wave_log_hashes=None, -): - wave_job_ids = wave_job_ids or wave_log_hashes or [] - if DailyMetricJob.objects.filter( - pk__in=wave_job_ids, - status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_EXPORTING], - ).exists(): - kwargs = { - "wave_job_ids": wave_job_ids, - "collections": collections, - "include_logs_with_error": include_logs_with_error, - "batch_size": batch_size, - "max_log_files": max_log_files, - "auto_reexecute": auto_reexecute, - "replace": replace, - "track_errors": track_errors, - "from_date": from_date, - "until_date": until_date, - "days_to_go_back": days_to_go_back, - "queue_name": queue_name, - "user_id": user_id, - "username": username, - "skip_log_hashes": skip_log_hashes, - "poll_interval_seconds": poll_interval_seconds, - } - if robots_source is not None: - kwargs["robots_source"] = robots_source - - 
apply_kwargs = { - "kwargs": kwargs, - "countdown": poll_interval_seconds, - } - if queue_name: - apply_kwargs["queue"] = queue_name - task_wait_parse_logs_wave.apply_async(**apply_kwargs) - return {"wave_completed": False, "reexecution_enqueued": False} - - kwargs = { - "collections": collections, - "include_logs_with_error": include_logs_with_error, - "batch_size": batch_size, - "max_log_files": max_log_files, - "auto_reexecute": auto_reexecute, - "replace": replace, - "track_errors": track_errors, - "from_date": from_date, - "until_date": until_date, - "days_to_go_back": days_to_go_back, - "queue_name": queue_name, - "user_id": user_id, - "username": username, - "skip_log_hashes": skip_log_hashes, - } - if robots_source is not None: - kwargs["robots_source"] = robots_source - - apply_kwargs = {"kwargs": kwargs} - if queue_name: - apply_kwargs["queue"] = queue_name - task_parse_logs.apply_async(**apply_kwargs) - return {"wave_completed": True, "reexecution_enqueued": True} diff --git a/metrics/tasks/process.py b/metrics/tasks/process.py deleted file mode 100644 index ecdc7a5..0000000 --- a/metrics/tasks/process.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging - -from django.utils.translation import gettext as _ - -from config import celery_app -from core.utils.request_utils import _get_user -from metrics.models import DailyMetricJob - -from metrics.services.jobs import acquire_daily_metric_job, mark_daily_metric_job_exported, mark_daily_metric_job_failed -from metrics.services.export import export_daily_metric_payload, load_daily_metric_payload -from metrics.services.resources import build_search_client, fetch_required_resources -from metrics.services.parser import process_daily_metric_job - - -@celery_app.task(bind=True, name=_("[Metrics] Process Daily Job"), timelimit=-1) -def task_process_daily_metric_job( - self, - job_id, - track_errors=False, - user_id=None, - username=None, - robots_source=None, -): - user = _get_user(self.request, username=username, 
user_id=user_id) - - try: - job = acquire_daily_metric_job(job_id) - except DailyMetricJob.DoesNotExist: - logging.error("Daily metric job %s does not exist.", job_id) - return - - if not job: - return - - try: - payload = load_daily_metric_payload(job) - if payload is None or not job.payload_hash: - robots_list, mmdb = fetch_required_resources(robot_source=robots_source) - if not robots_list or not mmdb: - raise RuntimeError("Required parsing resources are not available.") - payload = process_daily_metric_job( - job=job, - robots_list=robots_list, - mmdb=mmdb, - track_errors=track_errors, - ) - job.refresh_from_db() - - search_client = build_search_client() - if not search_client.ping(): - raise RuntimeError("OpenSearch client is not available.") - - export_daily_metric_payload( - search_client=search_client, - job=job, - payload=payload, - ) - except Exception as exc: - logging.error("Failed to process daily metric job %s: %s", job_id, exc) - mark_daily_metric_job_failed(job, exc) - return - - mark_daily_metric_job_exported(job, user=user) diff --git a/metrics/tasks/resume.py b/metrics/tasks/resume.py index c0fe705..cd1c76a 100644 --- a/metrics/tasks/resume.py +++ b/metrics/tasks/resume.py @@ -1,22 +1,10 @@ -import logging - -from django.utils import timezone from django.utils.translation import gettext as _ from config import celery_app -from core.utils.date_utils import get_date_obj, get_date_range_str from core.utils.request_utils import _get_user -from log_manager import choices -from log_manager.models import LogFile -from metrics.models import DailyMetricJob - -from metrics.services.jobs import create_or_update_daily_metric_job, release_stale_daily_metric_jobs -from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date -from metrics.services.parser import is_stale_parsing_log, requeue_stale_parsing_log -from metrics.counter import parser - -from .parse import task_parse_logs -from .process import 
task_process_daily_metric_job +from metrics.services import resume +from metrics.tasks.daily_metric_exports import task_build_and_export_daily_metric_job +from metrics.tasks.log_parsing import task_enqueue_log_parsing_jobs @celery_app.task(bind=True, name=_("[Metrics] Resume Log Exports"), timelimit=-1) @@ -33,69 +21,18 @@ def task_resume_log_exports( robots_source=None, ): _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) - from_date_obj = get_date_obj(from_date) - until_date_obj = get_date_obj(until_date) - - released_stale_jobs = release_stale_daily_metric_jobs( + return resume.resume_daily_metric_jobs( + daily_metric_export_task=task_build_and_export_daily_metric_job, collections=collections, - from_date=from_date_obj, - until_date=until_date_obj, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, stale_after_minutes=stale_after_minutes, + queue_name=queue_name, + user_id=user_id, + username=username, + robots_source=robots_source, ) - queryset = DailyMetricJob.objects.filter( - status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_ERROR], - access_date__gte=from_date_obj, - access_date__lte=until_date_obj, - ).select_related("collection").order_by("access_date", "collection__acron3") - if collections: - queryset = queryset.filter(collection__acron3__in=collections) - - resumed_jobs = 0 - for job in queryset: - log_files = get_log_files_for_collection_date( - collection=job.collection, - access_date=job.access_date, - status_filters=[ - choices.LOG_FILE_STATUS_QUEUED, - choices.LOG_FILE_STATUS_ERROR, - ], - ) - if log_files: - job = create_or_update_daily_metric_job( - collection=job.collection, - access_date=job.access_date, - log_files=log_files, - ) - elif not (job.storage_path and job.payload_hash): - logging.warning( - "Skipping daily metric job %s: no queued/error logs or stored payload.", - job.pk, - ) - continue - - 
if job.status == DailyMetricJob.STATUS_EXPORTED: - continue - - task_process_daily_metric_job.apply_async( - args=(job.pk, False, user_id, username, robots_source), - queue=queue_name or extract_celery_queue_name(job.collection.acron3), - ) - resumed_jobs += 1 - - logging.info( - "Resumed daily metric jobs for %s day(s); released %s stale job(s) at %s.", - resumed_jobs, - released_stale_jobs, - timezone.now(), - ) - return { - "resumed_logs": resumed_jobs, - "resumed_jobs": resumed_jobs, - "released_stale_batches": released_stale_jobs, - "released_stale_jobs": released_stale_jobs, - } @celery_app.task(bind=True, name=_("[Metrics] Resume Stale Parsing Logs"), timelimit=-1) @@ -114,53 +51,18 @@ def task_resume_stale_parsing_logs( username=None, robots_source=None, ): - from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) - from_date_obj = get_date_obj(from_date) - until_date_obj = get_date_obj(until_date) - - queryset = ( - LogFile.objects.filter(status=choices.LOG_FILE_STATUS_PARSING) - .select_related("collection") - .order_by("validation__probably_date", "path", "hash") + return resume.resume_stale_parsing_logs( + log_parsing_task=task_enqueue_log_parsing_jobs, + collections=collections, + batch_size=batch_size, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + stale_after_minutes=stale_after_minutes, + max_log_files=max_log_files, + queue_name=queue_name, + user_id=user_id, + username=username, + robots_source=robots_source, ) - if collections: - queryset = queryset.filter(collection__acron3__in=collections) - - resumed_logs = 0 - for log_file in queryset: - probably_date = parser.extract_date_from_validation_dict(log_file.validation) - if not probably_date or probably_date < from_date_obj or probably_date > until_date_obj: - continue - if not is_stale_parsing_log(log_file, stale_after_minutes=stale_after_minutes): - continue - - requeue_stale_parsing_log(log_file) - 
resumed_logs += 1 - if max_log_files and resumed_logs >= max_log_files: - break - - apply_kwargs = { - "kwargs": { - "collections": collections, - "include_logs_with_error": True, - "batch_size": batch_size, - "max_log_files": max_log_files, - "auto_reexecute": False, - "replace": False, - "track_errors": track_errors, - "from_date": from_date, - "until_date": until_date, - "days_to_go_back": None, - "queue_name": queue_name, - "user_id": user_id, - "username": username, - "robots_source": robots_source, - } - } - if queue_name: - apply_kwargs["queue"] = queue_name - task_parse_logs.apply_async(**apply_kwargs) - return { - "stale_logs_marked_for_retry": resumed_logs, - "parse_logs_enqueued": True, - } diff --git a/metrics/tests/conftest.py b/metrics/tests/conftest.py new file mode 100644 index 0000000..0260026 --- /dev/null +++ b/metrics/tests/conftest.py @@ -0,0 +1,60 @@ +from datetime import date +from pathlib import Path + +import pytest + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture +def books_collection(db): + return Collection.objects.create(acron3="books", acron2="bk") + + +@pytest.fixture +def scl_collection(db): + return Collection.objects.create(acron3="scl", acron2="sc") + + +@pytest.fixture +def preprints_collection(db): + return Collection.objects.create(acron3="preprints", acron2="pp") + + +@pytest.fixture +def data_collection(db): + return Collection.objects.create(acron3="data", acron2="dt") + + +@pytest.fixture +def robots_list(): + path = FIXTURES_DIR / "counter-robots.txt" + return path.read_text().splitlines() + + +@pytest.fixture +def mmdb_data(): + path = FIXTURES_DIR / "map.mmdb" + return path.read_bytes() + + +@pytest.fixture +def log_file_factory(db): + def _create(collection, hash_value, **kwargs): + defaults = { + "path": f"/tmp/{hash_value}.log.gz", + "stat_result": {}, + "status": 
choices.LOG_FILE_STATUS_QUEUED, + "date": date(2024, 1, 15), + "validation": {"probably_date": "2024-01-15"}, + } + defaults.update(kwargs) + return LogFile.objects.create( + collection=collection, hash=hash_value, **defaults + ) + + return _create diff --git a/metrics/tests/counter/__init__.py b/metrics/tests/counter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/counter/access/__init__.py b/metrics/tests/counter/access/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/counter/access/test_accumulation.py b/metrics/tests/counter/access/test_accumulation.py new file mode 100644 index 0000000..ccf9044 --- /dev/null +++ b/metrics/tests/counter/access/test_accumulation.py @@ -0,0 +1,183 @@ +import unittest +from datetime import datetime + +from scielo_usage_counter.values import ( + CONTENT_TYPE_FULL_TEXT, + DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_HTML, + MEDIA_FORMAT_PDF, +) + +from metrics.counter.access import accumulation + + +class TestAccumulation(unittest.TestCase): + def _book_counter_access(self, **overrides): + base = { + "collection": "books", + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:Q7GTD", + "title_pid_generic": "BOOK:Q7GTD", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2023", + "document_title": "Book Title", + "source_main_title": "Book Title", + "source_subject_area_capes": [], + "source_subject_area_wos": [], + "source_acronym": None, + "source_publisher_name": ["SciELO Books"], + } + base.update(overrides) + return base + + def _line(self, **overrides): + base = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": datetime(2024, 1, 15, 10, 0, 5), + } + base.update(overrides) + return base + + def 
test_stores_source_and_periods(self): + results = {} + accumulation.accumulate(results, self._book_counter_access(), self._line()) + + self.assertEqual(len(results), 1) + result = next(iter(results.values())) + self.assertEqual(result["source"]["source_type"], "book") + self.assertEqual(result["source"]["source_id"], "q7gtd") + self.assertEqual(result["source"]["main_title"], "Book Title") + self.assertEqual(result["access_date"], "2024-01-15") + self.assertEqual(result["access_month"], "202401") + self.assertEqual(result["access_year"], "2024") + self.assertEqual(result["access_country_code"], "BR") + self.assertEqual(result["content_language"], "en") + self.assertEqual(result["title_pid_generic"], "BOOK:Q7GTD") + self.assertEqual(result["document"], {"title": "Book Title"}) + self.assertIn("user_session_id", result) + + def test_rejects_invalid_local_datetime(self): + results = {} + with self.assertRaises(ValueError): + accumulation.accumulate( + results, + self._book_counter_access(), + self._line(local_datetime=None), + ) + self.assertEqual(results, {}) + + def test_does_not_expand_book_into_segments(self): + results = {} + counter_access = self._book_counter_access( + source_id="c2248", + pid_generic="BOOK:C2248", + title_pid_generic="BOOK:C2248", + segment_pid_generics=[ + "BOOK:C2248/CHAPTER:00", + "BOOK:C2248/CHAPTER:01", + "BOOK:C2248/CHAPTER:02", + ], + media_format=MEDIA_FORMAT_PDF, + media_language="pt", + publication_year="2018", + source_main_title="C2248 Book", + ) + accumulation.accumulate(results, counter_access, self._line()) + self.assertEqual(len(results), 1) + result = list(results.values())[0] + self.assertEqual(result["pid_generic"], "BOOK:C2248") + + def test_double_click_filter_uses_url_bucket_for_same_item(self): + results = {} + counter_access = self._book_counter_access( + source_id="c2248", + pid_generic="BOOK:C2248/CHAPTER:03", + title_pid_generic="BOOK:C2248", + media_language="pt", + publication_year="2018", + 
source_main_title="C2248 Book", + ) + + accumulation.accumulate( + results, + counter_access, + self._line( + local_datetime=datetime(2024, 1, 15, 10, 0, 5), + url="/id/c2248/03", + ), + ) + accumulation.accumulate( + results, + counter_access, + self._line( + local_datetime=datetime(2024, 1, 15, 10, 0, 20), + url="https://books.scielo.org/id/c2248/epub/03.html?x=1", + ), + ) + + raw = next(iter(results.values())) + self.assertEqual( + set(raw["click_timestamps_by_url"]), + {"/id/c2248/03", "/id/c2248/epub/03.html"}, + ) + + def test_same_url_within_window_produces_single_url_bucket(self): + results = {} + counter_access = self._book_counter_access( + source_id="c2248", + pid_generic="BOOK:C2248/CHAPTER:03", + title_pid_generic="BOOK:C2248", + media_language="pt", + publication_year="2018", + source_main_title="C2248 Book", + ) + + accumulation.accumulate( + results, + counter_access, + self._line( + local_datetime=datetime(2024, 1, 15, 10, 0, 5), + url="/id/c2248/03?from=search", + ), + ) + accumulation.accumulate( + results, + counter_access, + self._line( + local_datetime=datetime(2024, 1, 15, 10, 0, 20), + url="/id/c2248/03?from=search", + ), + ) + + raw = next(iter(results.values())) + self.assertEqual( + raw["click_timestamps_by_url"], + {"/id/c2248/03": {"00:05": 1, "00:20": 1}}, + ) + + def test_generates_session_id_from_client_ip_datetime(self): + results = {} + accumulation.accumulate(results, self._book_counter_access(), self._line()) + result = next(iter(results.values())) + self.assertEqual( + result["user_session_id"], "browser|1.0|127.0.0.1|2024-01-15|10" + ) + + def test_ipv6_address_is_accepted(self): + results = {} + accumulation.accumulate( + results, + self._book_counter_access(), + self._line(ip_address="2001:4860:7:1103::"), + ) + result = next(iter(results.values())) + self.assertIn("2001:4860:7:1103::", result["user_session_id"]) diff --git a/metrics/tests/counter/access/test_extraction.py b/metrics/tests/counter/access/test_extraction.py 
new file mode 100644 index 0000000..e89705c --- /dev/null +++ b/metrics/tests/counter/access/test_extraction.py @@ -0,0 +1,208 @@ +import unittest + +from scielo_usage_counter.values import ( + CONTENT_TYPE_ABSTRACT, + CONTENT_TYPE_FULL_TEXT, + DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_HTML, + MEDIA_FORMAT_PDF, +) + +from metrics.counter.access import extraction + + +class TestExtraction(unittest.TestCase): + def test_normalizes_source_fields_for_journal(self): + data = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2024", + "journal_main_title": "Journal Title", + "journal_subject_area_capes": ["Health Sciences"], + "journal_subject_area_wos": ["Medicine"], + "journal_acronym": "testjou", + "journal_publisher_name": ["SciELO"], + }, + ) + + self.assertEqual(data["source_type"], "journal") + self.assertEqual(data["source_id"], "1234-5678") + self.assertEqual(data["source_main_title"], "Journal Title") + self.assertEqual(data["source_acronym"], "testjou") + + def test_normalizes_source_fields_for_books(self): + data = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "q7gtd", + "document_type": "chapter", + "book_id": "q7gtd", + "book_title": "Book Title", + "title_pid_generic": "book:q7gtd", + "pid_generic": "book:q7gtd/chapter:03", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2023", + }, + ) + + self.assertEqual(data["source_type"], "book") + self.assertEqual(data["source_id"], "q7gtd") + self.assertEqual(data["scielo_issn"], DEFAULT_SCIELO_ISSN) + self.assertEqual(data["source_main_title"], "Book Title") + self.assertEqual(data["title_pid_generic"], "BOOK:Q7GTD") + + def test_preserves_access_url_and_free_to_read(self): + data = extraction.extract( + "books", + { + "source_type": 
"book", + "source_id": "c2248", + "document_type": "book", + "book_id": "c2248", + "book_title": "Book Title", + "title_pid_generic": "book:c2248", + "pid_generic": "book:c2248", + "media_language": "pt", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_url": "/id/c2248/pdf/freitas-9788599662830.pdf", + "source_access_type": "free_to_read", + }, + ) + + self.assertEqual(data["access_url"], "/id/c2248/pdf/freitas-9788599662830.pdf") + self.assertEqual(data["counter_access_type"], "Free_To_Read") + + def test_tolerates_malformed_media_language(self): + data = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "q7gtd", + "document_type": "book", + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "media_language": "'", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(data["media_language"], "un") + + def test_sets_document_title_by_type(self): + chapter = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "q7gtd", + "document_type": "chapter", + "book_id": "q7gtd", + "chapter_id": "03", + "pid_generic": "book:q7gtd/chapter:03", + "book_title": "Book Title", + "chapter_title": "Chapter Title", + "media_format": MEDIA_FORMAT_HTML, + "media_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + book = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "q7gtd", + "document_type": "book", + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "book_title": "Book Title", + "media_format": MEDIA_FORMAT_HTML, + "media_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + article = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "article_title": "Article Title", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(chapter["document_title"], "Chapter Title") + 
self.assertEqual(book["document_title"], "Book Title") + self.assertEqual(article["document_title"], "Article Title") + + def test_normalizes_collection_document_types(self): + preprint = extraction.extract( + "preprints", + { + "pid_generic": "10.1590/SciELOPreprints.1234", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + dataset = extraction.extract( + "data", + { + "pid_generic": "10.48331/scielodata.abc123", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_ABSTRACT, + }, + ) + article = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(preprint["source_type"], "preprint_server") + self.assertEqual(preprint["document_type"], "preprint") + self.assertEqual(dataset["source_type"], "data_repository") + self.assertEqual(dataset["document_type"], "dataset") + self.assertEqual(article["source_type"], "journal") + self.assertEqual(article["document_type"], "article") + + def test_empty_or_none_translated_url_returns_empty_dict(self): + self.assertEqual(extraction.extract("scl", None), {}) + self.assertEqual(extraction.extract("scl", {}), {}) + + def test_counter_access_type_defaults_to_open(self): + data = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "abc123", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + self.assertEqual(data["counter_access_type"], "Open") + + def test_commercial_access_type_maps_to_controlled(self): + data = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "abc123", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "source_access_type": "commercial", + }, + ) + self.assertEqual(data["counter_access_type"], "Controlled") diff --git a/metrics/tests/counter/access/test_validation.py 
b/metrics/tests/counter/access/test_validation.py new file mode 100644 index 0000000..83f030c --- /dev/null +++ b/metrics/tests/counter/access/test_validation.py @@ -0,0 +1,169 @@ +import unittest + +from scielo_usage_counter.values import ( + CONTENT_TYPE_ABSTRACT, + CONTENT_TYPE_FULL_TEXT, + CONTENT_TYPE_UNDEFINED, + DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_HTML, + MEDIA_FORMAT_PDF, + MEDIA_FORMAT_UNDEFINED, +) + +from metrics.counter.access import validation + + +class TestValidation(unittest.TestCase): + def test_valid_journal_access(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertTrue(result) + + def test_valid_book_source(self): + data = { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_generic": "BOOK:Q7GTD", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertTrue(result) + + def test_undefined_media_format_is_invalid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_UNDEFINED, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_undefined_content_type_is_invalid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_UNDEFINED, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_missing_all_pids_is_invalid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "", + "pid_v3": 
"", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_html_format_is_valid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertTrue(result) + + def test_abstract_content_type_is_valid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_ABSTRACT, + } + result, _ = validation.is_valid(data) + self.assertTrue(result) + + def test_dataset_without_source_or_language_is_valid(self): + data = { + "document_type": "dataset", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "DOI:10.48331/SCIELODATA.JLMAIY", + "media_language": "un", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_ABSTRACT, + } + result, _ = validation.is_valid(data) + self.assertTrue(result) + + def test_missing_media_language_is_invalid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_missing_scielo_issn_for_article_is_invalid(self): + data = { + "scielo_issn": "", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def 
test_preprint_requires_pid_generic(self): + data = { + "document_type": "preprint", + "pid_v2": None, + "pid_v3": "abc123", + "pid_generic": "", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_chapter_requires_source_id(self): + data = { + "document_type": "chapter", + "source_id": "", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_generic": "BOOK:Q7GTD/CHAPTER:03", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = validation.is_valid(data) + self.assertFalse(result) + + def test_non_dict_input_is_invalid(self): + result, check = validation.is_valid(None) + self.assertFalse(result) + self.assertEqual(check["code"], "invalid_format") diff --git a/metrics/tests/counter/indexing/__init__.py b/metrics/tests/counter/indexing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/counter/indexing/test_converter.py b/metrics/tests/counter/indexing/test_converter.py new file mode 100644 index 0000000..184e871 --- /dev/null +++ b/metrics/tests/counter/indexing/test_converter.py @@ -0,0 +1,472 @@ +import unittest + +from scielo_usage_counter.values import ( + CONTENT_TYPE_ABSTRACT, + CONTENT_TYPE_FULL_TEXT, + DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_HTML, +) + +from metrics.counter.indexing import converter as index_docs + + +class TestConverter(unittest.TestCase): + def test_creates_month_and_year_views_for_book_chapter(self): + data = { + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|browser|1.0|127.0.0.1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:Q7GTD/CHAPTER:03", + "document": {"title": "Chapter Title"}, + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": 
{"00:05": 1}, + "access_country_code": "BR", + "content_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": {"book_id": "q7gtd", "isbn": "9788578791889"}, + "city": "Sao Paulo", + "country": "BR", + "subject_area_capes": [], + "subject_area_wos": [], + "acronym": None, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + } + } + + metrics_data = index_docs.convert(data) + + self.assertEqual(set(metrics_data.keys()), {"month", "year"}) + self.assertEqual(len(metrics_data["month"]), 2) + self.assertEqual(len(metrics_data["year"]), 2) + + month_item = metrics_data["month"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023" + ] + self.assertEqual(month_item["access"], {"month": "2024-01"}) + self.assertIn("daily_metrics", month_item) + self.assertNotIn("access_country_code", month_item) + self.assertNotIn("content_language", month_item) + self.assertEqual(month_item["document"]["id"], "BOOK:Q7GTD/CHAPTER:03") + self.assertEqual(month_item["document"]["type"], "chapter") + self.assertEqual(month_item["document"]["title"], "Chapter Title") + self.assertEqual(month_item["document"]["parent_id"], "BOOK:Q7GTD") + self.assertEqual(month_item["document"]["publication_year"], "2023") + self.assertEqual(month_item["document"]["identifiers"]["book_id"], "q7gtd") + self.assertEqual(month_item["document"]["identifiers"]["chapter_id"], "03") + self.assertEqual(month_item["document"]["identifiers"]["isbn"], "9788578791889") + self.assertNotIn("pid_generic", month_item["document"]["identifiers"]) + self.assertEqual(month_item["counter"]["metric_scope"], "item") + self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") + self.assertEqual(month_item["total_requests"], 1) + 
self.assertEqual(month_item["unique_requests"], 1) + self.assertNotIn("scielo_issn", month_item["source"]) + self.assertNotIn("book_id", month_item["source"].get("identifiers", {})) + self.assertEqual(month_item["source"]["publisher_name"], ["SciELO Books"]) + + month_title = metrics_data["month"][ + "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" + ] + self.assertEqual(month_title["document"]["id"], "BOOK:Q7GTD") + self.assertEqual(month_title["document"]["type"], "book") + self.assertEqual(month_title["document"]["title"], "Book Title") + self.assertNotIn("parent_id", month_title["document"]) + self.assertEqual(month_title["counter"]["metric_scope"], "title") + self.assertEqual(month_title["counter"]["data_type"], "Book") + self.assertEqual(month_title["total_requests"], 1) + self.assertEqual(month_title["total_investigations"], 1) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + year_item = metrics_data["year"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|en|BR|2024|Open|Regular|2023" + ] + self.assertEqual( + year_item["access"], + {"year": "2024", "country_code": "BR", "content_language": "en"}, + ) + self.assertNotIn("daily_metrics", year_item) + self.assertEqual(year_item["document"]["title"], "Chapter Title") + self.assertEqual(year_item["counter"]["metric_scope"], "item") + self.assertEqual(year_item["total_requests"], 1) + + year_title = metrics_data["year"][ + "title|books|q7gtd|||BOOK:Q7GTD|en|BR|2024|Open|Regular|2023" + ] + self.assertEqual(year_title["counter"]["metric_scope"], "title") + self.assertEqual(year_title["document"]["title"], "Book Title") + self.assertNotIn("daily_metrics", year_title) + self.assertEqual(year_title["total_requests"], 1) + self.assertEqual(year_title["total_investigations"], 1) + self.assertEqual(year_title["unique_requests"], 1) + self.assertEqual(year_title["unique_investigations"], 1) + + def 
test_maps_counter_data_types_for_preprint_and_dataset(self): + data = { + "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|sess|BR|un|html|full_text": { + "collection": "preprints", + "source_key": "scielo-preprints", + "document_type": "preprint", + "pid_generic": "10.1590/SCIELOPREPRINTS.1234", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "un", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "preprint_server", + "source_id": "scielo-preprints", + "main_title": "SciELO Preprints", + }, + "publication_year": "2024", + }, + "data|scielo-data|||10.48331/SCIELODATA.ABC123|sess|BR|un|html|abstract": { + "collection": "data", + "source_key": "scielo-data", + "document_type": "dataset", + "pid_generic": "10.48331/SCIELODATA.ABC123", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "un", + "content_type": CONTENT_TYPE_ABSTRACT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "data_repository", + "source_id": "scielo-data", + "main_title": "SciELO Data", + }, + "publication_year": "2024", + }, + } + + metrics_data = index_docs.convert(data) + preprint_doc = metrics_data["month"][ + "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|2024-01|Open|Regular|2024" + ] + dataset_doc = metrics_data["month"][ + "data|scielo-data|||10.48331/SCIELODATA.ABC123|2024-01|Open|Regular|2024" + ] + + self.assertEqual(preprint_doc["counter"]["data_type"], "Article") + self.assertEqual(preprint_doc["document"]["type"], "preprint") + self.assertEqual(preprint_doc["document"]["id"], "10.1590/SCIELOPREPRINTS.1234") + self.assertEqual(preprint_doc["counter"]["article_version"], "Preprint") + self.assertEqual(dataset_doc["counter"]["data_type"], "Dataset") + 
self.assertNotIn("article_version", dataset_doc["counter"]) + + def test_dedupes_book_unique_item_across_formats(self): + data = { + "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|html|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:45": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert(data) + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" + ] + month_title = metrics_data["month"][ + "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] + + 
self.assertEqual(month_item["total_requests"], 2) + self.assertEqual(month_item["total_investigations"], 2) + self.assertEqual(month_item["unique_requests"], 1) + self.assertEqual(month_item["unique_investigations"], 1) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + def test_skips_book_landing_page_from_item_scope(self): + data = { + "books|c2248|||BOOK:C2248|sess|BR|pt|html|abstract": { + "collection": "books", + "source_key": "c2248", + "document_type": "book", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248", + "document": {"title": "C2248 Book"}, + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_ABSTRACT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert(data) + self.assertEqual( + set(metrics_data["month"].keys()), + {"title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"}, + ) + self.assertEqual( + set(metrics_data["year"].keys()), + {"title|books|c2248|||BOOK:C2248|pt|BR|2024|Open|Regular|2018"}, + ) + + def test_whole_book_without_segments_counts_as_book_segment(self): + data = { + "books|c2248|||BOOK:C2248|sess|BR|pt|pdf|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "book", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248", + "document": {"title": "C2248 Book"}, + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + 
"content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert(data) + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] + month_title = metrics_data["month"][ + "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] + + self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") + self.assertEqual(month_item["counter"]["metric_scope"], "item") + self.assertEqual(month_item["document"]["id"], "BOOK:C2248") + self.assertNotIn("parent_id", month_item["document"]) + self.assertEqual(month_title["counter"]["data_type"], "Book") + self.assertEqual(month_title["counter"]["metric_scope"], "title") + + def test_aggregates_multiple_chapters_at_title_level(self): + data = { + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|session1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_generic": "BOOK:Q7GTD/CHAPTER:01", + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "session1", + "click_timestamps": {"00:05": 1}, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": {"book_id": "q7gtd"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + }, + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|session1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_generic": "BOOK:Q7GTD/CHAPTER:02", + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "session1", + 
"click_timestamps": {"00:10": 1}, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": {"book_id": "q7gtd"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + }, + } + + metrics_data = index_docs.convert(data) + self.assertEqual(len(metrics_data["month"]), 3) + self.assertEqual(len(metrics_data["year"]), 3) + + month_title = metrics_data["month"][ + "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" + ] + self.assertEqual(month_title["total_requests"], 2) + self.assertEqual(month_title["total_investigations"], 2) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + def test_double_click_collapses_same_url_within_30_seconds(self): + from datetime import datetime + + from metrics.counter.access import accumulation + + results = {} + counter_access = { + "collection": "books", + "source_type": "book", + "source_id": "c2248", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "media_language": "pt", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2018", + "source_main_title": "C2248 Book", + } + base_line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "url": "/id/c2248/03?from=search", + } + + accumulation.accumulate( + results, + counter_access, + {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 5)}, + ) + accumulation.accumulate( + results, + counter_access, + {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 20)}, + ) + + metrics_data = index_docs.convert(results) + month_item = metrics_data["month"][ + 
"books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" + ] + self.assertEqual(month_item["total_requests"], 1) + self.assertEqual(month_item["unique_requests"], 1) + + def test_article_pipeline_sets_journal_parent(self): + data = { + "scl|1234-5678||abc123||sess|BR|en|pdf|full_text": { + "collection": "scl", + "source_key": "1234-5678", + "document_type": "article", + "pid_v2": None, + "pid_v3": "abc123", + "pid_generic": None, + "document": {"title": "Article Title"}, + "user_session_id": "sess", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "journal", + "source_id": "1234-5678", + "scielo_issn": "1234-5678", + "main_title": "Test Journal", + }, + "publication_year": "2024", + } + } + + metrics_data = index_docs.convert(data) + month_doc = list(metrics_data["month"].values())[0] + + self.assertEqual(month_doc["counter"]["data_type"], "Article") + self.assertEqual(month_doc["counter"]["parent_data_type"], "Journal") + self.assertEqual(month_doc["counter"]["metric_scope"], "item") + self.assertEqual(month_doc["document"]["type"], "article") + self.assertEqual(month_doc["total_requests"], 1) + self.assertEqual(month_doc["total_investigations"], 1) + + def test_non_dict_input_returns_empty(self): + result = index_docs.convert(None) + self.assertEqual(result, {"month": {}, "year": {}}) diff --git a/metrics/tests/fixtures/counter-robots.txt b/metrics/tests/fixtures/counter-robots.txt new file mode 100644 index 0000000..f206558 --- /dev/null +++ b/metrics/tests/fixtures/counter-robots.txt @@ -0,0 +1,307 @@ +bot +^Buck\/[0-9] +spider +crawl +^.?$ +[^a]fish +^IDA$ +^ruby$ +^@ozilla\/\d +^脝脝陆芒潞贸碌脛$ +^破解后的$ +AddThis +A6-Indexer +ADmantX +alexa +Alexandria(\s|\+)prototype(\s|\+)project +AllenTrack +almaden +appie +API[\+\s]scraper +Arachni +Arachmo +architext +ArchiveTeam +aria2\/\d +arks 
+^Array$ +asterias +atomz +BDFetch +Betsie +baidu +biglotron +BingPreview +binlar +bjaaland +Blackboard[\+\s]Safeassign +blaiz-bee +bloglines +blogpulse +boitho\.com-dc +bookmark-manager +Brutus\/AET +BUbiNG +bwh3_user_agent +CakePHP +celestial +cfnetwork +checklink +checkprivacy +China\sLocal\sBrowse\s2\.6 +Citoid +cloakDetect +coccoc\/1\.0 +Code\sSample\sWeb\sClient +ColdFusion +collection@infegy.com +com\.plumanalytics +combine +contentmatch +ContentSmartz +convera +core +Cortana +CoverScout +crusty\/\d +curl\/ +cursor +custo +DataCha0s\/2\.0 +daum(oa)? +^\%?default\%?$ +DeuSu\/ +Dispatch\/\d +Docoloc +docomo +Download\+Master +Drupal +DSurf +DTS Agent +EasyBib[\+\s]AutoCite[\+\s] +easydl +EBSCO\sEJS\sContent\sServer +EcoSearch +ELinks\/ +EmailSiphon +EmailWolf +Embedly +EThOS\+\(British\+Library\) +facebookexternalhit\/ +favorg +FDM(\s|\+)\d +Feedbin +feedburner +FeedFetcher +feedreader +ferret +Fetch(\s|\+)API(\s|\+)Request +findlinks +findthatfile +^FileDown$ +^Filter$ +^firefox$ +^FOCA +Fulltext +Funnelback +Genieo +GetRight +geturl +GigablastOpenSource +G-i-g-a-b-o-t +GLMSLinkAnalysis +Goldfire(\s|\+)Server +google +Grammarly +grub +gulliver +gvfs\/ +harvest +heritrix +holmes +htdig +htmlparser +HttpComponents\/1.1 +HTTPFetcher +http.?client +httpget +httrack +ia_archiver +ichiro +iktomi +ilse +Indy Library +^integrity\/\d +internetseer +intute +iSiloX +iskanie +^java\/\d{1,2}.\d +jeeves +Jersey\/\d +jobo +kyluka +larbin +libcurl +libhttp +libwww +lilina +^LinkAnalyser +link.?check +LinkLint-checkonly +^LinkParser\/ +^LinkSaver\/ +linkscan +LinkTiger +linkwalker +lipperhey +livejournal\.com +LOCKSS +LongURL.API +ltx71 +lwp +lycos[_+] +mail\.ru +MarcEdit +mediapartners-google +megite +MetaURI[\+\s]API\/\d\.\d +Microsoft(\s|\+)URL(\s|\+)Control +Microsoft Office Existence Discovery +Microsoft Office Protocol Discovery +Microsoft-WebDAV-MiniRedir +mimas +mnogosearch +moget +motor +^Mozilla$ +^Mozilla.4\.0$ +^Mozilla\/4\.0\+\(compatible;\)$ 
+^Mozilla\/4\.0\+\(compatible;\+ICS\)$ +^Mozilla\/4\.5\+\[en]\+\(Win98;\+I\)$ +^Mozilla.5\.0$ +^Mozilla\/5.0\+\(compatible;\+MSIE\+6\.0;\+Windows\+NT\+5\.0\)$ +^Mozilla\/5\.0\+like\+Gecko$ +^Mozilla\/5.0(\s|\+)Gecko\/20100115(\s|\+)Firefox\/3.6$ +^MSIE +MuscatFerre +myweb +nagios +^NetAnts\/\d +netcraft +netluchs +newspaper\/\d +ng\/2\. +^Ning\/\d +no_user_agent +nomad +nutch +^oaDOI$ +ocelli +Offline(\s|\+)Navigator +OgScrper +okhttp +onetszukaj +^Opera\/4$ +OurBrowser +panscient +parsijoo +^Pattern\/\d +Pcore-HTTP +pear\.php\.net +perman +PHP\/ +pidcheck +pioneer +playmusic\.com +playstarmusic\.com +^Postgenomic(\s|\+)v2 +powermarks +proximic +PycURL +python +Qwantify +rambler +ReactorNetty\/\d +Readpaper +redalert +Riddler +robozilla +rss +scan4mail +scientificcommons +scirus +scooter +Scrapy\/\d +ScoutJet +^scrutiny\/\d +SearchBloxIntra +shoutcast +Site24x7 +SkypeUriPreview +slurp +sogou +speedy +sqlmap +SrceDAMP +Strider +summify +sunrise +Sysomos +T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E +tailrank +Teleport(\s|\+)Pro +Teoma +The\+Knowledge\+AI +titan +^Traackr\.com$ +Trello +Trove +Turnitin +twiceler +Typhoeus +ucsd +ultraseek +^undefined$ +^unknown$ +Unpaywall +URL2File +urlaliasbuilder +urllib +^user.?agent$ +^User-Agent +validator +virus.detector +voila +^voltron$ +voyager\/ +w3af\.org +Wanadoo +Web(\s|\+)Downloader +WebCloner +webcollage +WebCopier +Webinator +weblayers +Webmetrics +webmirror +webmon +weborama-fetcher +webreaper +WebStripper +WebZIP +Wget +WhatsApp +wordpress +worm +www\.gnip\.com +WWW-Mechanize +xenu +y!j +yacy +yahoo +yandex +Yeti\/\d +zeus +zyborg +7siters diff --git a/metrics/tests/fixtures/map.mmdb b/metrics/tests/fixtures/map.mmdb new file mode 100644 index 0000000..257d56e Binary files /dev/null and b/metrics/tests/fixtures/map.mmdb differ diff --git a/metrics/tests/fixtures/usage.books.log b/metrics/tests/fixtures/usage.books.log new file mode 100644 index 0000000..103a339 --- /dev/null +++ b/metrics/tests/fixtures/usage.books.log @@ 
-0,0 +1,9 @@ +186.215.90.179 - - [01/Apr/2012:00:00:29 -0300] "GET /id/xjcw9 HTTP/1.1" 200 13833 "http://books.scielo.org/search/index.php" "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B176 Safari/7534.48.3" +189.97.101.205 - - [01/Apr/2012:00:30:27 -0300] "GET /id/h8pyf/08 HTTP/1.1" 200 10775 "http://books.scielo.org/search/index.php" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; MANM)" +189.100.12.229 - - [01/Apr/2012:00:00:00 -0300] "GET /id/3hs/pdf/sampaio-9788523206277.pdf HTTP/1.1" 200 1057116 "http://books.scielo.org/id/3hs" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.83 Safari/535.11" +216.189.176.205 - - [01/Apr/2012:00:41:33 -0300] "GET /id/hd5d8/epub/gelamo-9788598605951.epub HTTP/1.1" 200 239376 "http://books.scielo.org/id/hd5d8" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4" +MISS|200|1757894367|5117|4384504|2001:4860:7:1103::|https://www.google.com/|https://books.scielo.org/id/96spq|MI|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36|8e4f6321a936335906fccc9c9d9211af|BR +MISS|200|1757894377|5899|4384504|213.135.156.0|-|https://books.scielo.org/id/3dqnm/10|DE|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36|0f9f9bf8c4fe32029f8e881aebaac4a1|RU +MISS|200|1757894381|715390|4384504|45.56.186.0|-|http://books.scielo.org/id/htnbt/pdf/caldeira-9788579830419-10.pdf|IL|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Trailer/93.3.8652.5|713aed01fd0c98aa682b5afe4e646b2c|US +MISS|200|1757894328|10148382|4384504|170.23.5.0|-|https://books.scielo.org/id/wg88m/epub/ortigoza-9788579831287.epub|IL|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/134.0.0.0 Safari/537.3|e0d3c7f5b005566d3dbec37db52f674e|US +MISS|200|1757894338|1377|4384504|166.88.79.0|-|https://books.scielo.org/id/p8kpd/Text/12.xhtml|DE|Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.3|bc11c7eaf60775124f454befbce23cca|PL diff --git a/metrics/tests/fixtures/usage.dat.log b/metrics/tests/fixtures/usage.dat.log new file mode 100644 index 0000000..cf40425 --- /dev/null +++ b/metrics/tests/fixtures/usage.dat.log @@ -0,0 +1,30 @@ +20.171.206.17 - - [01/Sep/2024:23:59:07 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&fileSortField=date&tagPresort=false HTTP/1.1" 200 30851 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:09 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileAccess=&fileTag=&fileSortField=date&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30941 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +192.168.1.116 - - [01/Sep/2024:23:59:10 -0300] "GET /api/dataverses/preprints HTTP/1.1" 200 2659 "-" "ops/3.3.0.14" +192.168.169.235 - - [01/Sep/2024:23:59:13 -0300] "GET / HTTP/1.1" 200 28444 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36" +20.171.206.17 - - [01/Sep/2024:23:59:14 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=%22Text%22&fileAccess=&fileTag=&fileSortField=date&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 29558 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - 
[01/Sep/2024:23:59:16 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=name&fileSortOrder=desc&tagPresort=false&folderPresort=true HTTP/1.1" 200 30960 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +192.168.1.116 - - [01/Sep/2024:23:59:17 -0300] "GET /api/dataverses/preprints HTTP/1.1" 200 2659 "-" "ops/3.3.0.14" +20.171.206.17 - - [01/Sep/2024:23:59:20 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=date&fileSortOrder=desc&tagPresort=false HTTP/1.1" 200 30950 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:22 -0300] "GET /dataset.xhtml;jsessionid=0212e6bc89c71a2c0d48a3f76451?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=%22Document%22&fileAccess=&fileTag=&fileSortField=date&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30711 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +192.168.1.116 - - [01/Sep/2024:23:59:25 -0300] "GET /api/dataverses/preprints HTTP/1.1" 200 2659 "-" "ops/3.3.0.14" +20.171.206.17 - - [01/Sep/2024:23:59:25 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&tagPresort=false&folderPresort=true HTTP/1.1" 200 30818 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:27 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=type&tagPresort=false HTTP/1.1" 200 
30950 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:30 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=size&tagPresort=false HTTP/1.1" 200 30956 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +43.159.146.48 - - [01/Sep/2024:23:59:31 -0300] "GET /dataset.xhtml;jsessionid=e0171f1137481ce453dd659be0ac?persistentId=doi%3A10.48331%2Fscielodata.C4HFUF&version=&q=&fileTypeGroupFacet=&fileAccess=&tagPresort=false&folderPresort=true HTTP/1.1" 200 29722 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36" +20.171.206.17 - - [01/Sep/2024:23:59:32 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=name&fileSortOrder=desc&tagPresort=false&folderPresort=true HTTP/1.1" 200 30956 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:34 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileAccess=&fileTag=&fileSortField=&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30919 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:36 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30915 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; 
+https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:38 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=date&tagPresort=false HTTP/1.1" 200 30947 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:40 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=Restricted&fileTag=&fileSortField=&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30852 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:42 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=date&fileSortOrder=desc&tagPresort=false HTTP/1.1" 200 30950 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:43 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=%22Document%22&fileAccess=&fileTag=&fileSortField=&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30696 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:45 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileTag=&fileSortField=&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30939 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; 
+https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:47 -0300] "GET /dataset.xhtml;jsessionid=02146668a92ebd3249bc567cefcc?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=%22Text%22&fileAccess=&fileTag=&fileSortField=&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 29570 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +192.168.169.235 - - [01/Sep/2024:23:59:47 -0300] "GET / HTTP/1.1" 200 28437 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36" +20.171.206.17 - - [01/Sep/2024:23:59:49 -0300] "GET /dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=size&tagPresort=false HTTP/1.1" 200 30946 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:51 -0300] "GET /dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=%22Document%22&fileAccess=&fileTag=&fileSortField=size&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30704 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:53 -0300] "GET /dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30921 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:55 -0300] "GET 
/dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=Restricted&fileTag=&fileSortField=size&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30864 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:57 -0300] "GET /dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileTypeGroupFacet=&fileAccess=&fileSortField=date&tagPresort=false HTTP/1.1" 200 30947 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" +20.171.206.17 - - [01/Sep/2024:23:59:59 -0300] "GET /dataset.xhtml;jsessionid=0214dc416f0afe817703dcf8337e?persistentId=doi%3A10.48331%2Fscielodata.S4BDSX&version=&q=&fileAccess=&fileTag=&fileSortField=size&fileSortOrder=&tagPresort=false&folderPresort=true HTTP/1.1" 200 30937 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" diff --git a/metrics/tests/fixtures/usage.log b/metrics/tests/fixtures/usage.log new file mode 100644 index 0000000..8c50b2b --- /dev/null +++ b/metrics/tests/fixtures/usage.log @@ -0,0 +1,200 @@ +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /css/screen/layout.css HTTP/1.1" 404 427 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /article.js HTTP/1.1" 303 8231 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +192.168.1.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 200 1353 
"https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:50 -0300] "GET /xsl/pmc/v3.0/xml.css HTTP/1.1" 304 5766 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 303 181 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:50 -0300] "GET /css/screen/general.css HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:50 -0300] "GET /css/screen/styles.css HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +89.155.115.203 - - [21/May/2021:11:30:37 -0300] "GET /scielo.php?script=sci_arttext&pid=S0102-69092018000300512 HTTP/1.1" 200 44995 "https://www.google.com/" "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/137.2.345735309 Mobile/15E148 Safari/604.1" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/artsrc.gif HTTP/1.1" 502 239 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 
Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/toc.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +192.168.2.155 - - [21/May/2021:23:58:50 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/next.gif HTTP/1.1" 302 193 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "POST /img/en/author.gif HTTP/1.1" 204 219 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/20231:23:58:50 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 301 1353 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +67.205.129.249 - - [21/May/2021:05:05:16 -0300] "GET /scielo.php?download&pid=S0102-86502014000700465&format=EndNote HTTP/1.1" 200 491 "http://www.scielo.br/scielo.php?script=sci_isoref&pid=S0102-86502014000700465&lng=en" "LOCKSS cache" +176.88.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 
Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:50 -0300] "PATCH /img/en/toc.gif HTTP/1.1" 200 164 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/search.gif HTTP/1.1" 200 210 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" ")" +176.88.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/fulltxt.gif HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 200 1052 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:50 -0300] "GET HTTP/1.1" 200 288 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 200 628 
"https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 200 652 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:50 -0300] "GET /img/en/fulltxt.gif HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/lattescv-button.gif HTTP/1.1" 200 1041 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 200 288 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedGoogleOn.gif HTTP/1.1" 200 641 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/90.0.4430.212 Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 200 181 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedOff.gif HTTP/1.1" 200 262 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/css/responsive.css HTTP/1.1" 200 16842 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedGoogleOn.gif HTTP/1.1" 200 625 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/iconPermalink.gif HTTP/1.1" 200 382 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/icon-close.png HTTP/1.1" 200 3091 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 
"http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/subject.gif HTTP/1.1" 200 229 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/ep/v30n1/a04tab01.gif HTTP/1.1" 200 47388 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/lattescv-button.gif HTTP/1.1" 200 1041 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/ep/v30n1/seta.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /article.js HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" 
+176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedOff.gif HTTP/1.1" 200 262 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/ep/v30n1/a04img01.gif HTTP/1.1" 200 115469 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/css/style.css HTTP/1.1" 200 14769 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?script=sci_arttext&pid=S1679-39512005000100007 HTTP/1.1" 200 37818 "http://www.google.com/" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt HTTP/1.1" 200 35262 "https://scholar.google.com.br/" "Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/ep/v30n1/a04img02.gif HTTP/1.1" 200 108500 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/search.gif HTTP/1.1" 200 210 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/ep/v30n1/a04img03.gif HTTP/1.1" 200 51911 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +45.146.0.2 - - [21/May/2021:23:58:51 -0300] "HEAD /scielo.php?script=sci_arttext&pid=S1984-82502011000200005 HTTP/1.1" 200 19267 "http://www.scielo.br:80/scielo.php',(CAST((CHR(113)||CHR(98)||CHR(113)||CHR(98)||CHR(113))||(SELECT (CASE WHEN (3633=3633) THEN 1 ELSE 0 END))::text||(CHR(113)||CHR(98)||CHR(98)||CHR(107)||CHR(113)) AS NUMERIC))-- iAjd" "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; fr-fr) AppleWebKit/125.5.5 (KHTML, like Gecko) Safari/125.11" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 200 1052 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/e-mailt.gif HTTP/1.1" 200 586 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedGoogleOn.gif HTTP/1.1" 200 641 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/artsrc.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/toc.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen.css HTTP/1.1" 200 89 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/iconPermalink.gif HTTP/1.1" 200 382 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 304 0 
"https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedGoogleOn.gif HTTP/1.1" 200 625 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +177.17.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/pt/fbpelogp.gif HTTP/1.1" 200 1353 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1415-43662015000600534&lng=pt&tlng=pt" "Mozilla/5.0 (Linux; Android 10; SM-A105M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.17.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/pt/artsrc.gif HTTP/1.1" 200 270 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1415-43662015000600534&lng=pt&tlng=pt" "Mozilla/5.0 (Linux; Android 10; SM-A105M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +45.146.0.2 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?script=sci_arttext&pid=S1984-82502015000200317 HTTP/1.1" 200 15467 "http://www.scielo.br:80/scielo.php" "-2051 OR 6187=(SELECT UPPER(XMLType(CHR(60)||CHR(58)||CHR(113)||CHR(122)||CHR(98)||CHR(122)||CHR(113)||(SELECT (CASE WHEN (6187=6187) THEN 1 ELSE 0 END) FROM DUAL)||CHR(113)||CHR(112)||CHR(122)||CHR(98)||CHR(113)||CHR(62))) FROM DUAL)# Gheq" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /xsl/pmc/v3.0/xml.css HTTP/1.1" 200 5766 
"https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +114.119.0.0 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?script=sci_abstract&pid=S0104-59702008000200001&lng=pt&nrm=iso&tlng=pt HTTP/1.1" 404 574 "-" "Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot)" +45.70.0.0 - - [21/May/2021:23:58:51 -0300] "GET /pdf/ld/v16n2/1518-7632-ld-16-02-00261.pdf HTTP/1.1" 200 937525 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +211.218.0.2 - - [21/May/2021:23:58:51 -0300] "GET /img/revistas/rb/v48n6//0100-3984-rb-48-06-0345-gf02.jpg HTTP/1.1" 200 471319 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/styles.css HTTP/1.1" 200 3572 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/layout.css HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 
"http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/artsrc.gif HTTP/1.1" 200 239 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 200 181 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 200 193 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 200 193 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 200 652 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET 
/img/en/artsrc.gif HTTP/1.1" 200 239 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 200 1353 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/toc.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 200 193 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +177.17.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/pt/grp1c.gif HTTP/1.1" 200 202 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1415-43662015000600534&lng=pt&tlng=pt" "Mozilla/5.0 (Linux; Android 10; SM-A105M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +157.38.0.1 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?script=sci_arttext&pid=S0103-50532011000300020 HTTP/1.1" 200 15815 "https://www.google.com/" "Mozilla/5.0 (Linux; Android 8.1.0; Redmi Note 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile 
Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /css/screen.css HTTP/1.1" 200 89 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/author.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/general.css HTTP/1.1" 200 133 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/layout.css HTTP/1.1" 200 427 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/subject.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /applications/scielo-org/js/toolbox.js HTTP/1.1" 200 3653 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/search.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 
Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/home.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/styles.css HTTP/1.1" 200 3572 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /article.js HTTP/1.1" 200 8231 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/author.gif HTTP/1.1" 200 219 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/js/modernizr.custom.js HTTP/1.1" 200 11295 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /applications/scielo-org/js/jquery-1.4.2.min.js HTTP/1.1" 200 72174 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/general.css HTTP/1.1" 200 133 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/layout.css HTTP/1.1" 200 427 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/styles.css HTTP/1.1" 200 3572 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fulltxt.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedGoogleOn.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedOff.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedGoogleOn.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET 
/google_metrics/get_h5_m5.php?issn=0104-6632&callback=jsonp1621652375255 HTTP/1.1" 200 155 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /article.js HTTP/1.1" 200 8231 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 200 628 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 200 1353 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 200 181 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/artsrc.gif HTTP/1.1" 200 239 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/toc.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/iconPermalink.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/icon-close.png HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/e-mailt.gif HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/expandd2.png HTTP/1.1" 200 1487 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/collapsed2.png HTTP/1.1" 200 339 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 200 193 
"https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/css/bootstrap.min.css HTTP/1.1" 200 106092 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/subject.gif HTTP/1.1" 200 229 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/fulltxt.gif HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 200 288 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 200 1052 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/lattescv-button.gif HTTP/1.1" 200 1041 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/87.0.4280.101 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fbpelogp.gif HTTP/1.1" 200 1353 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/toc.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/grp1c.gif HTTP/1.1" 200 181 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/artsrc.gif HTTP/1.1" 200 239 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/next.gif HTTP/1.1" 200 193 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 200 288 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 
(Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 200 628 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/toc.gif HTTP/1.1" 200 164 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/prev.gif HTTP/1.1" 200 244 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +201.182.0.1 - - [21/May/2021:23:58:51 -0300] "GET /scielo.php?pid=S1414-81452019000200211&script=sci_arttext&tlng=pt HTTP/1.1" 200 27425 "https://www.google.com/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/common/icon-close.png HTTP/1.1" 200 3091 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/subject.gif HTTP/1.1" 200 229 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 200 1052 
"https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/search.gif HTTP/1.1" 200 210 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/author.gif HTTP/1.1" 200 219 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/collapsed2.png HTTP/1.1" 200 339 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/expandd2.png HTTP/1.1" 200 1487 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +191.240.0.2 - - 
[21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/js/jquery.1.9.1.min.js HTTP/1.1" 200 92630 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fulltxt.gif HTTP/1.1" 200 643 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/author.gif HTTP/1.1" 200 219 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/subject.gif HTTP/1.1" 200 229 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 200 652 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/search.gif HTTP/1.1" 200 210 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/home.gif HTTP/1.1" 200 190 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconStatistics.gif HTTP/1.1" 200 1052 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/fulltxt.gif HTTP/1.1" 200 643 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.52.0.1 - - [21/May/2021:23:58:51 -0300] "GET /google_metrics/get_h5_m5.php?issn=1517-9702&callback=jsonp1621652390876 HTTP/1.1" 200 155 "https://www.scielo.br/scielo.php?pid=S1517-97022004000100004&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 200 628 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 200 652 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fulltxt.gif 
HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +190.232.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1809-29502015000300246&lang=es" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 200 288 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/lattescv-button.gif HTTP/1.1" 200 1041 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/collapsed2.png HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +201.182.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen.css HTTP/1.1" 200 89 "https://www.scielo.br/scielo.php?pid=S1414-81452019000200211&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 
Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/expandd2.png HTTP/1.1" 304 0 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +174.249.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/alpha.gif HTTP/1.1" 200 220 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132018000200083" "Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/js/bootstrap.min.js HTTP/1.1" 200 28538 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +186.225.0.1 - - [21/May/2021:23:58:51 -0300] "GET /google_metrics/get_h5_m5.php?issn=1516-8484&callback=jsonp1621652389467 HTTP/1.1" 200 155 "https://www.scielo.br/scielo.php?pid=S1516-84842007000300007&script=sci_abstract&tlng=es" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedGoogleOn.gif HTTP/1.1" 200 641 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedGoogleOn.gif HTTP/1.1" 200 625 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedOff.gif HTTP/1.1" 200 262 
"https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/fulltxt.gif HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconXMLDocument.gif HTTP/1.1" 200 652 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconPDFDocument.gif HTTP/1.1" 200 628 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +201.182.0.1 - - [21/May/2021:23:58:51 -0300] "GET /xsl/pmc/v3.0/xml.css HTTP/1.1" 200 5766 "https://www.scielo.br/scielo.php?pid=S1414-81452019000200211&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/fulltxt.gif HTTP/1.1" 200 643 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/lattescv-button.gif HTTP/1.1" 200 1041 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" 
+138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconTranslation.gif HTTP/1.1" 200 578 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +176.88.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/fbpe/bjce/v17n4-7/a4f1.gif HTTP/1.1" 200 9521 "https://www.scielo.br/scielo.php?pid=S0104-66322000000400005&script=sci_arttext" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/common/iconPermalink.gif HTTP/1.1" 200 382 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +177.185.0.0 - - [21/May/2021:23:58:51 -0300] "GET /img/common/icon-close.png HTTP/1.1" 200 3091 "https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512005000100007" "Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedOff.gif HTTP/1.1" 200 288 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +201.182.0.1 - - [21/May/2021:23:58:51 -0300] "GET /css/screen/general.css HTTP/1.1" 200 133 "https://www.scielo.br/css/screen.css" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +201.182.0.1 - - [21/May/2021:23:58:51 -0300] "GET /article.js HTTP/1.1" 200 8231 "https://www.scielo.br/scielo.php?pid=S1414-81452019000200211&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/90.0.4430.212 Safari/537.36" +54.241.0.1 - - [21/May/2021:23:58:51 -0300] "GET / HTTP/1.1" 200 2935 "-" "Amazon-Route53-Health-Check-Service (ref 1261cdc1-a132-45b2-8c26-5de713c689cb; report http://amzn.to/1vsZADi)" +191.240.0.2 - - [21/May/2021:23:58:51 -0300] "GET /xsl/plus/static/js/scielo-article.js HTTP/1.1" 200 4329 "https://www.scielo.br/scielo.php?pid=S1415-65552014000600874&script=sci_arttext_plus&tlng=pt" "Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconRelatedOff.gif HTTP/1.1" 200 262 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +138.97.0.1 - - [21/May/2021:23:58:51 -0300] "GET /img/en/iconCitedGoogleOn.gif HTTP/1.1" 200 641 "https://www.scielo.br/scielo.php?pid=S0100-40422017000700791&script=sci_arttext&tlng=pt" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" diff --git a/metrics/tests/fixtures/usage.scl.bunnynet.log b/metrics/tests/fixtures/usage.scl.bunnynet.log new file mode 100644 index 0000000..620dd48 --- /dev/null +++ b/metrics/tests/fixtures/usage.scl.bunnynet.log @@ -0,0 +1,67 @@ +HIT|200|1757548785|9146|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/FAPESP.png|CN|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36|7caaeff65a64c5235f863868a7c94d69|CN +HIT|200|1757548786|5432|4339610|186.225.0.1|-|https://www.scielo.br/j/neco/a/dqLRqnpmnncSmnzMCB8bzPG/|BR|Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36|8dbbeef65a64c5235f863868a7c94d70|BR 
+MISS|200|1757548787|12345|4339610|177.52.0.1|https://www.google.com|https://www.scielo.br/j/psoc/a/hbSYnTbyNfzxcWT3FpXrL5G/?lang=es|BR|Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36|9eccfef65a64c5235f863868a7c94d71|BR +HIT|200|1757548788|7890|4339610|200.144.0.1|-|https://www.scielo.br/j/rbz/a/cKnLLBn5NnshCX93Y6qYpHv/?format=pdf|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0|afdd0ef65a64c5235f863868a7c94d72|BR +MISS 200 1755473649 29321 4339610 185.29.10.0 - http://www.scielo.br/j/rbb/a/qvkmfPDpQk4zZfSnWXJHrVQ/?lang=pt SE Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0 5dc9f2b1416a10af31321b8aad30b8f4 SE +MISS|200|1755473648|26413|4339610|185.29.10.0|-|http://www.scielo.br/j/pab/a/xBnG6SmJRwz7Hzs8dVKNfqv/?lang=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|b9824b1192f138e1c09c6f1aceea92a0|SE +MISS|200|1755473648|30693|4339610|185.29.10.0|-|http://www.scielo.br/j/cagro/a/z8d5Z5hGJxZgLRgQvcGLMdf/?lang=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|a544b893e26f389c7b1045246d756831|SE +MISS|301|1755473648|1435|4339610|185.29.10.0|-|http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-84042002000300008&lng=pt&nrm=iso&tlng=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|d0b6cb231fafac81cf42c524d05e0882|SE +MISS|301|1755473648|1448|4339610|185.29.10.0|-|http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-204X2005000400015&lng=pt&tlng=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|88fa5720e9fac38470a00ef6f8e6d35a|SE +MISS|301|1755473647|1455|4339610|185.29.10.0|-|http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1413-70542010000100007&lng=pt&tlng=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|73c06380eecbdad38e094a3623592fc7|SE 
+MISS|200|1755473644|37617|4339610|185.29.10.0|-|http://www.scielo.br/j/rbz/a/CKSH5K8T7x7Y84zMnSb7L4L/?lang=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|61d8e51d071e39d6b7b770ddf6406ae6|SE +MISS|301|1755473643|1447|4339610|185.29.10.0|-|http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-35982007001000002&lng=pt&tlng=pt|SE|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0|74b6a623493442ba3535e0a374f67229|SE +MISS|301|1755469323|1160|4339610|23.98.186.0|-|https://www.scielo.br/j/pcp/a/K4kBgKXqVW5HJt8WQT6chKd/|TX|Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot|f5ce4a53efe52ca927fe07ba147e2547|US +MISS|404|1755469323|3206|4339610|23.98.186.0|-|https://www.scielo.br/j/dpjo/a/cpSn3rmDvrkMNTHj7bsPxgh|TX|Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot|63df76250a368cbf739b066379603c37|US +HIT|200|1755387106|2362|4339610|116.5.172.0|-|https://www.scielo.br/media/images/BVS.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.43 Safari/537.36|42e0b299d66e93689d4233f961e421c1|CN +HIT|200|1755387106|1396|4339610|116.5.172.0|-|https://www.scielo.br/static/img/logo-open-access.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.43 Safari/537.36|b217d8614d19b1975be34ce4dce17b9a|CN +HIT|200|1755387106|116279|4339610|116.5.172.0|-|http://www.scielo.br/static/js/scielo-bundle-min.js|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.43 Safari/537.36|38f8e569d4db49213370cf57a13d2bcd|CN +MISS|200|1755387105|35358|4339610|113.64.81.0|-|http://www.scielo.br/.bunny-shield/bd/bunnyprint.js|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|77d2570ea71505716432f7ee5971f3f2|CN 
+HIT|200|1755387105|4237|4339610|113.64.81.0|-|https://www.scielo.br/media/images/BIREME.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|225052909be452a0d7820870b9eeee0c|CN +HIT|200|1755387105|8818|4339610|113.64.81.0|-|https://www.scielo.br/media/images/FAPESP.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|0be807dc181599a8edd0018a0432453e|CN +HIT|200|1755387105|6897|4339610|113.64.81.0|-|https://www.scielo.br/media/images/CAPES.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|f9a82944fa63468affa03834cba80575|CN +HIT|200|1755387105|8602|4339610|113.64.81.0|-|https://www.scielo.br/media/images/CNPq.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|be79fbe4041ef8f959913833e962082c|CN +HIT|200|1755387105|5085|4339610|113.64.81.0|-|https://www.scielo.br/media/images/FAP-UNIFESP.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.29 Safari/537.36|11889aa0e0b68b2e112ac86ccf6efc21|CN +MISS|200|1755387228|263055|4339610|190.216.61.0|https://www.scielo.br/j/cadbto/a/Rj4pnrVyh3Pt9MnJ9pkNZtM/?format=pdf&lang=en|https://www.scielo.br/j/cadbto/a/Rj4pnrVyh3Pt9MnJ9pkNZtM/?format=pdf&lang=en|AR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36|510a20260a79caca62a837dc37eb883f|AR +HIT|200|1757548785|9146|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/FAPESP.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|7caaeff65a64c5235f863868a7c94d69|CN +HIT|200|1757548784|2886|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-cnpq--dark.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|f95f87b22e0459457e2690a831b655b0|CN +MISS|200|1757548784|1283|4339610|107.172.204.0|-|https://www.scielo.br/j/rdgv/i/2023.v19/|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36|c44b e4b3a0e28476cc7ecd85536dd5dc|US +HIT|200|1757548783|7221|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-bvs.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|fd3b2b560bd40042517330d996066cb4|CN +HIT|200|1757548783|4566|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/BIREME.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|da6a4e9ad37b74579423c248c4504c72|CN +HIT|200|1757548782|154545|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-capes--dark.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|b8be8b7dc04b7282aa6a6ba039369c04|CN +HIT|200|1757548782|7225|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/CAPES.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|5640aff5a6b6be98ffa42d4ec816ea5f|CN +HIT|200|1757548782|8930|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/CNPq.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|56f0e22b76f07607da2a7299a0c0fd60|CN +HIT|200|1757548782|2690|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/BVS.png|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|4afb166068f9c5ede0a3acd1dbe470ce|CN 
+HIT|200|1757548781|15441|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-bireme--dark.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|cb2c33fee71a102fdb2a6fd24e85405d|CN +MISS|200|1757548781|13125|4339610|240e:3b0:7c11:e2ba::|-|http://www.scielo.br/|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.46 Safari/537.36|fe8f9167050f4a7fc499c9a74c7e884a|CN +HIT|200|1757548780|194279|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/img-post-blog-scielo-exemplo.jpg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|658b76904d833f16bdc1ea74a202a759|CN +HIT|200|1757548780|234005|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-capes.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|818432b6ce34ce9c24c333dec8ffedfa|CN +MISS|200|1757548779|1290|4339610|172.178.140.0|-|https://www.scielo.br/j/sdeb/a/Jbg5jB3yFMBQjnyJkcTfy3f/?format=pdf&lang=en|LA|Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0;+https://openai.com/bot|c02427cd6e5c12480d6497ed170b85e8|US +HIT|200|1757548779|2624|4339610|240e:3b0:a00e:10a3::|-|http://www.scielo.br/static/img/logo-fapesp.svg|LA|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.27 Safari/537.36|aa8664ca05c405df632d1c78b5cdf2ec|CN +HIT|206|1757548784|8472|4339610|2804:1b2:d144:5193::|-|https://www.scielo.br/static/img/favicon.ico|BR|WhatsApp/2.23.20.0|5aa2d7bc7ab139a391f03de133f0feac|BR +HIT|206|1757548784|7816|4339610|2804:1b2:d144:5193::|-|https://www.scielo.br/media/images/pusf_glogo.gif|BR|WhatsApp/2.23.20.0|8114c288c6ffca53ba3dbe7c0bdf284d|BR 
+MISS|200|1757548784|32087|4339610|2804:1b2:d144:5193::|-|https://www.scielo.br/j/pusf/a/8pqYN36tQsDYg8PMTqmRscs/?format=html&lang=pt|BR|WhatsApp/2.23.20.0|ba7f527cab609a10a6dd4d8bc2219f1b|BR +HIT|200|1757548784|5686|4339610|201.87.253.0|https://www.scielo.br/j/rdbci/a/7qMcGcKQbBsqqNxyTQgJnyr/?lang=pt|https://www.scielo.br/media/images/rdbci_logo.png|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36|a79e6a873e65fd9edc8ecd5538fc44e2|BR +MISS|200|1757548784|35214|4339610|201.87.253.0|-|https://www.scielo.br/j/rdbci/a/7qMcGcKQbBsqqNxyTQgJnyr/?lang=pt|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36|1fed1c099b06cb7673ef5de69b109f7b|BR +HIT|200|1757548784|2898|4339610|2804:1b2:d144:5193::|https://www.scielo.br/j/pusf/a/8pqYN36tQsDYg8PMTqmRscs/?format=html&lang=pt|https://www.scielo.br/static/img/favicon.ico|BR|Mozilla/5.0 (iPhone; CPU iPhoneOS 18_6_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Mobile/15E148 Safari/604.1|8dbd7649b4b2784bafb98d8247d2a12a|BR +MISS|301|1757548784|836|4339610|2804:214:8213:d647::|-|https://www.scielo.br/pdf/reeusp/v49n2/pt_0080-6234-reeusp-49-02-0261.pdf|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36|7b2e26c0e349bf7adc67162d9abd48b0|BR +MISS|200|1757548783|4027|4339610|191.37.20.0|https://www.scielo.br/static/css/bootstrap.css?v=1.1.20|https://www.scielo.br/static/fonts/scielo-social-network.ttf?dhp6e8|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|b27531886c923c5f20d554a9028f5055|BR +MISS|301|1757548783|1102|4339610|201.87.253.0|-|https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1678-765X2025000100400&lang=pt|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 
Safari/537.36|fd8c3cb118c1c175474d680d07eec150|BR +HIT|200|1757548783|4058|4339610|191.37.20.0|https://www.scielo.br/static/css/bootstrap.css?v=1.1.20|https://www.scielo.br/static/img/logo-scielo-no-label.svg|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|db9cbaa6b15abaec43f0c14f9c86a3c1|BR +HIT|200|1757548783|6382|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/static/img/scimago.svg|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|be05de389aad37df3b85cf31c11bef32|BR +MISS|200|1757548783|670|4339610|2804:29b8:5068:8edd::|https://www.scielo.br/j/sausoc/a/bsyjWnYqPyyJHnBYd5zrL5x/?format=html&lang=pt|https://www.scielo.br/.bunny-shield/bunnyprint/collect|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|c37c550b4f36633edeb894ba7a1586df|BR +HIT|304|1757548782|790|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/FAPESP.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|ed3109ecc86838b8c23870605c2471e3|BR +HIT|304|1757548782|786|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/BVS.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|5208750de8e07c0de695daf7f481a1a4|BR +HIT|304|1757548782|789|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/CNPq.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|845fedf70d4a8f348f1f99bf67d1d4ef|BR 
+HIT|304|1757548782|793|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/FAP-UNIFESP.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|321611726563c46cb2d2411cac9fa5f8|BR +HIT|304|1757548782|789|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/BIREME.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|f29948a6125e436b09d7196d61b977b3|BR +HIT|304|1757548782|788|4339610|187.92.246.0|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|https://www.scielo.br/media/images/CAPES.png|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|7b95a8b106b64f0ce2203c6105162699|BR +HIT|200|1757548782|7807|4339610|2804:1b2:d144:5193::|-|https://www.scielo.br/media/images/pusf_glogo.gif|BR|NetworkingExtension/8621.3.11.10.3 Network/4277.140.33 iOS/18.6.2|3f7e286388473e03c302450f29217375|BR +HIT|200|1757548782|2888|4339610|2804:1b2:d144:5193::|-|https://www.scielo.br/static/img/favicon.ico|BR|NetworkingExtension/8621.3.11.10.3 Network/4277.140.33 iOS/18.6.2|0069ba1a0d787b47ac27732c75cbffb7|BR +MISS|200|1757548782|23087|4339610|187.92.246.0|https://www.google.com/|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/?format=html&lang=pt|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|615444ecda7ae663140a7c740be495fd|BR +MISS|200|1757548782|32372|4339610|2804:29b8:5068:8edd::|https://www.scielo.br/j/sausoc/a/bsyjWnYqPyyJHnBYd5zrL5x/?format=html&lang=pt|https://www.scielo.br/.bunny-shield/bd/bunnyprint.js|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile 
Safari/537.36|b69221d23f7981991c114d52941d32ce|BR +HIT|200|1757548782|10181|4339610|2804:29b8:5068:8edd::|https://www.scielo.br/j/sausoc/a/bsyjWnYqPyyJHnBYd5zrL5x/?format=html&lang=pt|https://www.scielo.br/media/images/sausoc_glogo.gif|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|4601f8d68bd57394d278ee1ea81a376f|BR +MISS|200|1757548782|21786|4339610|2804:29b8:5068:8edd::|https://www.google.com/|https://www.scielo.br/j/sausoc/a/bsyjWnYqPyyJHnBYd5zrL5x/?format=html&lang=pt|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|6f31f80b57c8fc094b0907c05ab49c5e|BR +HIT|200|1757548782|4066|4339610|143.107.252.0|https://www.scielo.br/static/css/bootstrap.css?v=1.1.20|https://www.scielo.br/static/img/logo-scielo-no-label.svg|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36|8791ce286667fad03c4e67eba215f450|BR +MISS|200|1757548782|632|4339610|143.107.252.0|https://www.scielo.br/j/jvatitd/a/NGR7dTyggrnGBSxvQ6pdSJs/?lang=en|https://www.scielo.br/.bunny-shield/bunnyprint/collect|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36|4d31046b0f5d09b2c8aaf88281be4e08|BR +HIT|200|1757548782|905|4339610|143.107.252.0|https://www.scielo.br/static/css/article.css?v=1.1.20|https://www.scielo.br/static/img/dashline.png|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36|0377aa19b04b0ff1b8b4f4589a07600f|BR +MISS|301|1757548782|1142|4339610|187.92.246.0|https://www.google.com/|https://www.scielo.br/j/rbccv/a/qYJb4RL66h5Wpmg6X5KJ6Sm/|BR|Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Mobile Safari/537.36|33fde882099e2827cc7975e34f0f1e91|BR \ No newline at end of file diff --git a/metrics/tests/fixtures/user_agents.txt 
b/metrics/tests/fixtures/user_agents.txt new file mode 100644 index 0000000..60561f0 --- /dev/null +++ b/metrics/tests/fixtures/user_agents.txt @@ -0,0 +1,13 @@ +"-2051 OR 6187=(SELECT UPPER(XMLType(CHR(60)||CHR(58)||CHR(113)||CHR(122)||CHR(98)||CHR(122)||CHR(113)||(SELECT (CASE WHEN (6187=6187) THEN 1 ELSE 0 END) FROM DUAL)||CHR(113)||CHR(112)||CHR(122)||CHR(98)||CHR(113)||CHR(62))) FROM DUAL)# Gheq" +"Amazon-Route53-Health-Check-Service (ref 1261cdc1-a132-45b2-8c26-5de713c689cb; report http://amzn.to/1vsZADi)" +"Mozilla/5.0 (iPhone; CPU iPhone OS 13_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/137.2.345735309 Mobile/15E148 Safari/604.1" +"Mozilla/5.0 (Linux; Android 11; SM-N981U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Mobile Safari/537.36" +"Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot)" +"Mozilla/5.0 (Linux; Android 8.1.0; Redmi Note 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +"Mozilla/5.0 (Linux; Android 8.1.0; SM-J260MU) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +"Mozilla/5.0 (Linux; Android 9; SM-G9600) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.210 Mobile Safari/537.36" +"Mozilla/5.0 (Macintosh; U; PPC Mac OS X; fr-fr) AppleWebKit/125.5.5 (KHTML, like Gecko) Safari/125.11" +"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" +"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" +"LOCKSS cache" \ No newline at end of file diff --git a/metrics/tests/integration/__init__.py b/metrics/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/metrics/tests/integration/test_books_log_to_metrics.py b/metrics/tests/integration/test_books_log_to_metrics.py new file mode 100644 index 0000000..b898a9b --- /dev/null +++ b/metrics/tests/integration/test_books_log_to_metrics.py @@ -0,0 +1,185 @@ +import unittest +from datetime import datetime +from pathlib import Path + +from scielo_usage_counter.translator.books import URLTranslatorBooksSite +from scielo_usage_counter.url_translator import URLTranslationManager + +from metrics.counter.access import accumulation, extraction, validation +from metrics.counter.indexing import converter as index_docs +from scielo_usage_counter import log_handler + +FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures" + + +class TestBooksLogToMetrics(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.robots_list = (FIXTURES_DIR / "counter-robots.txt").read_text().splitlines() + cls.mmdb_data = (FIXTURES_DIR / "map.mmdb").read_bytes() + cls.log_path = str(FIXTURES_DIR / "usage.books.log") + cls.utm = URLTranslationManager( + documents_metadata=iter([]), + sources_metadata=iter([]), + translator=URLTranslatorBooksSite, + ) + + def _parse_log(self): + parser = log_handler.LogParser( + mmdb_data=self.mmdb_data, + robots_list=self.robots_list, + output_mode="dict", + ) + parser.logfile = self.log_path + return list(parser.parse()) + + def _extract_all(self, lines): + extracted = [] + for line in lines: + url = line.get("url") + if not url: + continue + translated = self.utm.translate(url) + if translated and isinstance(translated, dict): + counter_access = extraction.extract("books", translated) + extracted.append((counter_access, line)) + return extracted + + def test_parser_yields_lines_from_mixed_formats(self): + lines = self._parse_log() + self.assertGreater(len(lines), 0) + + def test_translation_extracts_book_ids(self): + lines = self._parse_log() + extracted = self._extract_all(lines) + self.assertGreater(len(extracted), 0) + + 
source_ids = {ca.get("source_id") for ca, _ in extracted} + self.assertGreater(len(source_ids), 0) + for ca, _ in extracted: + self.assertEqual(ca["source_type"], "book") + self.assertIsNotNone(ca.get("pid_generic")) + + def test_extraction_produces_book_and_chapter_types(self): + lines = self._parse_log() + extracted = self._extract_all(lines) + doc_types = {ca.get("document_type") for ca, _ in extracted} + self.assertTrue(doc_types & {"book", "chapter"}) + + def test_resolves_country_codes_via_geoip(self): + lines = self._parse_log() + countries = {line.get("country_code") for line in lines} + countries.discard(None) + self.assertGreater(len(countries), 0) + + def test_ipv6_address_is_parsed(self): + lines = self._parse_log() + has_ipv6 = any("::" in (line.get("ip_address") or "") for line in lines) + self.assertTrue(has_ipv6) + + def test_pdf_and_epub_formats_detected(self): + lines = self._parse_log() + extracted = self._extract_all(lines) + formats = {ca.get("media_format") for ca, _ in extracted} + self.assertTrue(len(formats) > 0) + + def test_full_pipeline_with_synthetic_metadata(self): + results = {} + counter_access = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "xjcw9", + "document_type": "book", + "book_id": "xjcw9", + "book_title": "Test Book", + "pid_generic": "book:xjcw9", + "title_pid_generic": "book:xjcw9", + "media_language": "pt", + "media_format": "html", + "content_type": "full_text", + }, + ) + + valid, _ = validation.is_valid(counter_access) + self.assertTrue(valid) + + accumulation.accumulate( + results, + counter_access, + { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "186.215.90.179", + "country_code": "BR", + "local_datetime": datetime(2012, 4, 1, 0, 0, 29), + }, + ) + + metrics = index_docs.convert(results) + self.assertGreater(len(metrics["month"]), 0) + self.assertGreater(len(metrics["year"]), 0) + + has_item = False + has_title = False + for doc in metrics["month"].values(): + 
scope = doc["counter"]["metric_scope"] + if scope == "item": + has_item = True + self.assertEqual(doc["counter"]["data_type"], "Book_Segment") + elif scope == "title": + has_title = True + self.assertEqual(doc["counter"]["data_type"], "Book") + + self.assertTrue(has_item) + self.assertTrue(has_title) + + def test_all_metric_fields_present_in_converted_document(self): + results = {} + counter_access = extraction.extract( + "books", + { + "source_type": "book", + "source_id": "h8pyf", + "document_type": "chapter", + "book_id": "h8pyf", + "chapter_id": "08", + "pid_generic": "book:h8pyf/chapter:08", + "title_pid_generic": "book:h8pyf", + "media_language": "pt", + "media_format": "html", + "content_type": "full_text", + "book_title": "Book H8PYF", + "chapter_title": "Chapter 08", + }, + ) + accumulation.accumulate( + results, + counter_access, + { + "client_name": "MSIE", + "client_version": "9.0", + "ip_address": "189.97.101.205", + "country_code": "BR", + "local_datetime": datetime(2012, 4, 1, 0, 30, 27), + }, + ) + + metrics = index_docs.convert(results) + for doc in metrics["month"].values(): + self.assertIn("total_requests", doc) + self.assertIn("total_investigations", doc) + self.assertIn("unique_requests", doc) + self.assertIn("unique_investigations", doc) + self.assertIn("collection", doc) + self.assertIn("source", doc) + self.assertIn("document", doc) + self.assertIn("counter", doc) + self.assertIn("access", doc) + self.assertIn("daily_metrics", doc) + + for doc in metrics["year"].values(): + access = doc.get("access", {}) + self.assertIn("year", access) + self.assertNotIn("daily_metrics", doc) diff --git a/metrics/tests/integration/test_bunnynet_log_to_metrics.py b/metrics/tests/integration/test_bunnynet_log_to_metrics.py new file mode 100644 index 0000000..084c831 --- /dev/null +++ b/metrics/tests/integration/test_bunnynet_log_to_metrics.py @@ -0,0 +1,45 @@ +import unittest +from pathlib import Path + +from scielo_usage_counter import log_handler + 
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures" + + +class TestBunnynetLogToMetrics(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.robots_list = (FIXTURES_DIR / "counter-robots.txt").read_text().splitlines() + cls.mmdb_data = (FIXTURES_DIR / "map.mmdb").read_bytes() + cls.log_path = str(FIXTURES_DIR / "usage.scl.bunnynet.log") + + def _parse_log(self): + parser = log_handler.LogParser( + mmdb_data=self.mmdb_data, + robots_list=self.robots_list, + output_mode="dict", + ) + parser.logfile = self.log_path + return list(parser.parse()), parser.stats + + def test_parses_bunnynet_pipe_separated_format(self): + lines, stats = self._parse_log() + self.assertGreater(len(lines), 0) + + def test_extracts_urls_from_bunnynet_format(self): + lines, _ = self._parse_log() + urls = [line.get("url") for line in lines if line.get("url")] + self.assertGreater(len(urls), 0) + + def test_resolves_country_codes(self): + lines, _ = self._parse_log() + countries = {line.get("country_code") for line in lines} + countries.discard(None) + self.assertGreater(len(countries), 0) + + def test_extracts_client_info(self): + lines, _ = self._parse_log() + for line in lines[:3]: + self.assertIn("client_name", line) + self.assertIn("ip_address", line) diff --git a/metrics/tests/integration/test_classic_log_to_metrics.py b/metrics/tests/integration/test_classic_log_to_metrics.py new file mode 100644 index 0000000..6480bc7 --- /dev/null +++ b/metrics/tests/integration/test_classic_log_to_metrics.py @@ -0,0 +1,105 @@ +import unittest +from pathlib import Path + +from scielo_usage_counter.translator.classic import URLTranslatorClassicSite +from scielo_usage_counter.url_translator import URLTranslationManager + +from metrics.counter.access import accumulation, extraction, validation +from metrics.counter.indexing import converter as index_docs +from scielo_usage_counter import log_handler + +FIXTURES_DIR = Path(__file__).resolve().parent.parent / 
"fixtures" + + +class TestClassicLogToMetrics(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.robots_list = (FIXTURES_DIR / "counter-robots.txt").read_text().splitlines() + cls.mmdb_data = (FIXTURES_DIR / "map.mmdb").read_bytes() + cls.log_path = str(FIXTURES_DIR / "usage.log") + cls.utm = URLTranslationManager( + documents_metadata=iter([]), + sources_metadata=iter([]), + translator=URLTranslatorClassicSite, + ) + + def _parse_log(self): + parser = log_handler.LogParser( + mmdb_data=self.mmdb_data, + robots_list=self.robots_list, + output_mode="dict", + ) + parser.logfile = self.log_path + return list(parser.parse()), parser.stats + + def _full_pipeline(self): + lines, stats = self._parse_log() + results = {} + valid_count = 0 + + for line in lines: + url = line.get("url") + if not url: + continue + + translated = self.utm.translate(url) + if not translated or not isinstance(translated, dict): + continue + + counter_access = extraction.extract("scl", translated) + is_valid, _ = validation.is_valid(counter_access) + if not is_valid: + continue + + try: + accumulation.accumulate(results, counter_access, line) + valid_count += 1 + except (ValueError, Exception): + pass + + return results, lines, stats, valid_count + + def test_filters_static_resources(self): + lines, stats = self._parse_log() + self.assertLess(len(lines), 200) + + def test_filters_bots(self): + lines, stats = self._parse_log() + for line in lines: + self.assertNotEqual(line.get("client_name", "").lower(), "lockss cache") + + def test_produces_article_type_metrics(self): + results, _, _, valid_count = self._full_pipeline() + if not results: + self.skipTest("No valid lines in classic fixture for this translator") + return + + metrics = index_docs.convert(results) + + for doc in metrics["month"].values(): + self.assertEqual(doc["counter"]["data_type"], "Article") + self.assertEqual(doc["counter"]["metric_scope"], "item") + self.assertEqual(doc["document"]["type"], 
"article") + + def test_sets_journal_parent_data_type(self): + results, _, _, _ = self._full_pipeline() + if not results: + self.skipTest("No valid lines") + return + + metrics = index_docs.convert(results) + for doc in metrics["month"].values(): + source_type = doc.get("source", {}).get("type") + if source_type == "journal": + self.assertEqual(doc["counter"]["parent_data_type"], "Journal") + + def test_handles_truncated_user_agent(self): + lines, _ = self._parse_log() + self.assertGreater(len(lines), 0) + + def test_valid_lines_produce_session_ids(self): + results, _, _, _ = self._full_pipeline() + for value in results.values(): + self.assertIn("user_session_id", value) + self.assertIsNotNone(value["user_session_id"]) diff --git a/metrics/tests/integration/test_pipelines.py b/metrics/tests/integration/test_pipelines.py new file mode 100644 index 0000000..95e700f --- /dev/null +++ b/metrics/tests/integration/test_pipelines.py @@ -0,0 +1,129 @@ +import unittest +from datetime import datetime + +from scielo_usage_counter.values import ( + CONTENT_TYPE_ABSTRACT, + CONTENT_TYPE_FULL_TEXT, + MEDIA_FORMAT_HTML, +) + +from metrics.counter.access import accumulation, extraction +from metrics.counter.indexing import converter as index_docs + + +class TestPreprintPipeline(unittest.TestCase): + def _build_preprint_access(self, **overrides): + base = { + "pid_generic": "10.1590/SciELOPreprints.1234", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "media_language": "en", + } + base.update(overrides) + return extraction.extract("preprints", base) + + def test_extraction_sets_preprint_types(self): + data = self._build_preprint_access() + self.assertEqual(data["source_type"], "preprint_server") + self.assertEqual(data["document_type"], "preprint") + self.assertEqual(data["source_id"], "scielo-preprints") + + def test_full_pipeline_produces_preprint_article_version(self): + counter_access = self._build_preprint_access() + results = {} + line = { + 
"client_name": "browser", + "client_version": "1.0", + "ip_address": "200.1.2.3", + "country_code": "BR", + "local_datetime": datetime(2024, 6, 15, 14, 30, 10), + } + accumulation.accumulate(results, counter_access, line) + metrics = index_docs.convert(results) + + month_docs = list(metrics["month"].values()) + self.assertEqual(len(month_docs), 1) + doc = month_docs[0] + self.assertEqual(doc["counter"]["data_type"], "Article") + self.assertEqual(doc["counter"]["article_version"], "Preprint") + self.assertEqual(doc["counter"]["metric_scope"], "item") + self.assertEqual(doc["document"]["type"], "preprint") + self.assertEqual(doc["document"]["id"], "10.1590/SCIELOPREPRINTS.1234") + self.assertEqual(doc["total_requests"], 1) + self.assertEqual(doc["unique_requests"], 1) + + +class TestDataversePipeline(unittest.TestCase): + def _build_dataset_access(self, **overrides): + base = { + "pid_generic": "10.48331/scielodata.abc123", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_ABSTRACT, + } + base.update(overrides) + return extraction.extract("data", base) + + def test_extraction_sets_dataset_types(self): + data = self._build_dataset_access() + self.assertEqual(data["source_type"], "data_repository") + self.assertEqual(data["document_type"], "dataset") + self.assertEqual(data["source_id"], "scielo-data") + + def test_full_pipeline_produces_dataset_metrics(self): + counter_access = self._build_dataset_access() + results = {} + line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "200.1.2.3", + "country_code": "BR", + "local_datetime": datetime(2024, 6, 15, 14, 30, 10), + } + accumulation.accumulate(results, counter_access, line) + metrics = index_docs.convert(results) + + month_docs = list(metrics["month"].values()) + self.assertEqual(len(month_docs), 1) + doc = month_docs[0] + self.assertEqual(doc["counter"]["data_type"], "Dataset") + self.assertNotIn("article_version", doc["counter"]) + 
self.assertEqual(doc["document"]["type"], "dataset") + self.assertEqual(doc["total_investigations"], 1) + self.assertEqual(doc["total_requests"], 0) + + +class TestOPACPipeline(unittest.TestCase): + def test_opac_article_produces_journal_article_metrics(self): + counter_access = extraction.extract( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "S1234-56782024000100001", + "article_title": "Test OPAC Article", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "media_language": "pt", + "journal_main_title": "Test Journal", + "journal_acronym": "testjou", + "journal_publisher_name": ["SciELO"], + }, + ) + + results = {} + line = { + "client_name": "Chrome", + "client_version": "120.0", + "ip_address": "189.10.20.30", + "country_code": "BR", + "local_datetime": datetime(2024, 3, 20, 8, 15, 42), + } + accumulation.accumulate(results, counter_access, line) + metrics = index_docs.convert(results) + + doc = list(metrics["month"].values())[0] + self.assertEqual(doc["counter"]["data_type"], "Article") + self.assertEqual(doc["counter"]["parent_data_type"], "Journal") + self.assertEqual(doc["document"]["type"], "article") + self.assertEqual(doc["source"]["type"], "journal") + self.assertEqual(doc["source"]["id"], "1234-5678") + self.assertEqual(doc["total_requests"], 1) diff --git a/metrics/tests/opensearch/__init__.py b/metrics/tests/opensearch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/test_opensearch.py b/metrics/tests/opensearch/test_client.py similarity index 64% rename from metrics/tests/test_opensearch.py rename to metrics/tests/opensearch/test_client.py index 49e21b3..9eb7ebc 100644 --- a/metrics/tests/test_opensearch.py +++ b/metrics/tests/opensearch/test_client.py @@ -3,26 +3,33 @@ from django.test import override_settings -from metrics import opensearch +from metrics.opensearch.client import OpenSearchUsageClient +from metrics.opensearch.mappings import ( + BOOKS_MONTH_INDEX_MAPPINGS, + 
BOOKS_YEAR_INDEX_MAPPINGS, + MONTH_INDEX_MAPPINGS, + YEAR_INDEX_MAPPINGS, + get_index_mappings, +) class OpenSearchUsageClientTests(TestCase): - @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client") + @patch.object(OpenSearchUsageClient, "get_opensearch_client") def test_create_index_sends_mappings_in_request_body(self, mock_get_client): mock_client = Mock() mock_get_client.return_value = mock_client - client = opensearch.OpenSearchUsageClient(url="https://example.org:9200") + client = OpenSearchUsageClient(url="https://example.org:9200") client.create_index( index_name="usage_monthly_books_202506", - mappings=opensearch.MONTH_INDEX_MAPPINGS, + mappings=MONTH_INDEX_MAPPINGS, ) mock_client.indices.create.assert_called_once_with( index="usage_monthly_books_202506", body={ "settings": {"index": {"number_of_replicas": 0}}, - "mappings": opensearch.MONTH_INDEX_MAPPINGS, + "mappings": MONTH_INDEX_MAPPINGS, }, ) @@ -33,7 +40,7 @@ def test_create_index_sends_mappings_in_request_body(self, mock_get_client): ) @patch("metrics.opensearch.client.OpenSearch") def test_verify_certs_false_explicitly_overrides_settings(self, mock_opensearch): - opensearch.OpenSearchUsageClient( + OpenSearchUsageClient( url="https://example.org:9200", verify_certs=False, ) @@ -45,23 +52,21 @@ def test_verify_certs_false_explicitly_overrides_settings(self, mock_opensearch) def test_get_index_mappings_returns_books_specific_mappings(self): self.assertIs( - opensearch.get_index_mappings("books", "month"), - opensearch.BOOKS_MONTH_INDEX_MAPPINGS, + get_index_mappings("books", "month"), + BOOKS_MONTH_INDEX_MAPPINGS, ) self.assertIs( - opensearch.get_index_mappings("books", "year"), - opensearch.BOOKS_YEAR_INDEX_MAPPINGS, - ) - self.assertIn("counter", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) - self.assertIn("access", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) - self.assertIn( - "applied_jobs", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"] + 
get_index_mappings("books", "year"), + BOOKS_YEAR_INDEX_MAPPINGS, ) + self.assertIn("counter", BOOKS_MONTH_INDEX_MAPPINGS["properties"]) + self.assertIn("access", BOOKS_YEAR_INDEX_MAPPINGS["properties"]) + self.assertIn("applied_jobs", BOOKS_MONTH_INDEX_MAPPINGS["properties"]) for mappings in ( - opensearch.MONTH_INDEX_MAPPINGS, - opensearch.YEAR_INDEX_MAPPINGS, - opensearch.BOOKS_MONTH_INDEX_MAPPINGS, - opensearch.BOOKS_YEAR_INDEX_MAPPINGS, + MONTH_INDEX_MAPPINGS, + YEAR_INDEX_MAPPINGS, + BOOKS_MONTH_INDEX_MAPPINGS, + BOOKS_YEAR_INDEX_MAPPINGS, ): for removed_field in ( "document_type", @@ -77,26 +82,27 @@ def test_get_index_mappings_returns_books_specific_mappings(self): ): self.assertNotIn(removed_field, mappings["properties"]) document_mapping = mappings["properties"]["document"] + source_mapping = mappings["properties"]["source"] self.assertEqual(document_mapping["properties"]["id"]["type"], "keyword") self.assertEqual(document_mapping["properties"]["title"]["type"], "text") + self.assertFalse(document_mapping["properties"]["title"]["index"]) + self.assertEqual(source_mapping["properties"]["id"]["type"], "keyword") + self.assertEqual(source_mapping["properties"]["title"]["type"], "text") + self.assertFalse(source_mapping["properties"]["title"]["index"]) self.assertEqual( - document_mapping["properties"]["title"]["fields"]["keyword"]["type"], - "keyword", - ) - self.assertEqual( - mappings["properties"]["source"]["properties"]["id"]["type"], - "keyword", + source_mapping["properties"]["publisher_name"]["type"], "text" ) + self.assertFalse(source_mapping["properties"]["publisher_name"]["index"]) @patch("metrics.opensearch.client.helpers.bulk") - @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client") + @patch.object(OpenSearchUsageClient, "get_opensearch_client") def test_increment_documents_for_daily_job_uses_applied_jobs( self, mock_get_client, mock_bulk, ): mock_get_client.return_value = Mock() - client = 
opensearch.OpenSearchUsageClient(url="https://example.org:9200") + client = OpenSearchUsageClient(url="https://example.org:9200") client.increment_documents_for_daily_job( index_name="usage_monthly_books_202506", diff --git a/metrics/tests/opensearch/test_names.py b/metrics/tests/opensearch/test_names.py new file mode 100644 index 0000000..f33dab1 --- /dev/null +++ b/metrics/tests/opensearch/test_names.py @@ -0,0 +1,23 @@ +import unittest + +from metrics.opensearch.names import generate_month_index_name, generate_year_index_name + + +class TestIndexNames(unittest.TestCase): + def test_generate_index_names_for_year_and_month(self): + self.assertEqual( + generate_year_index_name("usage", "scl", "2024-01-15"), + "usage_yearly_scl_2024", + ) + self.assertEqual( + generate_month_index_name("usage", "scl", "2024-01-15"), + "usage_monthly_scl_2024", + ) + self.assertEqual( + generate_year_index_name("usage", "books", "2024-01-15"), + "usage_yearly_books", + ) + self.assertEqual( + generate_month_index_name("usage", "books", "2024-01-15"), + "usage_monthly_books", + ) diff --git a/metrics/tests/parsing/__init__.py b/metrics/tests/parsing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/parsing/test_environment.py b/metrics/tests/parsing/test_environment.py new file mode 100644 index 0000000..b496779 --- /dev/null +++ b/metrics/tests/parsing/test_environment.py @@ -0,0 +1,39 @@ +from django.test import TestCase + +from metrics.services.parsing.environment import _get_translator_class + + +class TranslatorClassTests(TestCase): + def test_books_maps_to_books_translator(self): + cls = _get_translator_class("books") + self.assertEqual(cls.__name__, "URLTranslatorBooksSite") + + def test_classic_maps_to_classic_translator(self): + cls = _get_translator_class("classic") + self.assertEqual(cls.__name__, "URLTranslatorClassicSite") + + def test_opac_maps_to_opac_translator(self): + cls = _get_translator_class("opac") + 
self.assertEqual(cls.__name__, "URLTranslatorOPACSite") + + def test_opac_alpha_maps_to_opac_alpha_translator(self): + cls = _get_translator_class("opac_alpha") + self.assertEqual(cls.__name__, "URLTranslatorOPACAlphaSite") + + def test_preprints_maps_to_preprints_translator(self): + cls = _get_translator_class("preprints") + self.assertEqual(cls.__name__, "URLTranslatorPreprintsSite") + + def test_dataverse_maps_to_dataverse_translator(self): + cls = _get_translator_class("dataverse") + self.assertEqual(cls.__name__, "URLTranslatorDataverseSite") + + def test_unknown_name_returns_none(self): + self.assertIsNone(_get_translator_class("unknown")) + + def test_none_returns_none(self): + self.assertIsNone(_get_translator_class(None)) + + def test_case_insensitive(self): + cls = _get_translator_class("Books") + self.assertEqual(cls.__name__, "URLTranslatorBooksSite") diff --git a/metrics/tests/parsing/test_process_line.py b/metrics/tests/parsing/test_process_line.py new file mode 100644 index 0000000..549d93d --- /dev/null +++ b/metrics/tests/parsing/test_process_line.py @@ -0,0 +1,120 @@ +from datetime import date +from unittest.mock import Mock + +from django.test import TestCase +from scielo_usage_counter.values import CONTENT_TYPE_FULL_TEXT, MEDIA_FORMAT_HTML + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from metrics.services.parsing.lines import process_line + + +class ProcessLineTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + self.log_file = LogFile.objects.create( + hash="1" * 32, + path="/tmp/111.log.gz", + stat_result={}, + status=choices.LOG_FILE_STATUS_QUEUED, + collection=self.collection, + date=date(2012, 3, 10), + validation={"probably_date": "2012-03-10"}, + ) + + def _fake_utm(self, translate_return=None, translate_error=None): + utm = Mock() + if translate_error: + utm.translate.side_effect = translate_error + else: + 
utm.translate.return_value = translate_return or { + "source_type": "book", + "source_id": "q7gtd", + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + return utm + + def _line(self, **overrides): + base = { + "url": "/id/q7gtd", + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": None, + } + base.update(overrides) + return base + + def test_discards_invalid_local_datetime_without_raising(self): + results = {} + is_valid, error = process_line( + results=results, + line=self._line(), + utm=self._fake_utm(), + log_file=self.log_file, + ) + self.assertFalse(is_valid) + self.assertIsNone(error) + self.assertEqual(results, {}) + + def test_url_translation_error_returns_false_none(self): + results = {} + is_valid, error = process_line( + results=results, + line=self._line(), + utm=self._fake_utm(translate_error=ValueError("bad URL")), + log_file=self.log_file, + ) + self.assertFalse(is_valid) + self.assertIsNone(error) + + def test_valid_line_accumulates_result(self): + from datetime import datetime + + results = {} + is_valid, error = process_line( + results=results, + line=self._line(local_datetime=datetime(2024, 1, 15, 10, 0, 5)), + utm=self._fake_utm(), + log_file=self.log_file, + ) + self.assertTrue(is_valid) + self.assertIsNone(error) + self.assertEqual(len(results), 1) + + def test_validation_failure_without_track_errors_returns_no_discarded_line(self): + results = {} + utm = self._fake_utm( + translate_return={ + "pid_generic": "", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + ) + is_valid, error = process_line( + results=results, + line=self._line(), + utm=utm, + log_file=self.log_file, + track_errors=False, + ) + self.assertFalse(is_valid) + self.assertIsNone(error) + + def test_extraction_error_returns_false_none(self): + results = {} + 
utm = self._fake_utm(translate_return="not-a-dict") + is_valid, error = process_line( + results=results, + line=self._line(), + utm=utm, + log_file=self.log_file, + ) + self.assertFalse(is_valid) + self.assertIsNone(error) diff --git a/metrics/tests/services/__init__.py b/metrics/tests/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metrics/tests/test_cleanup.py b/metrics/tests/services/test_cleanup.py similarity index 95% rename from metrics/tests/test_cleanup.py rename to metrics/tests/services/test_cleanup.py index e08fa9c..4836cf8 100644 --- a/metrics/tests/test_cleanup.py +++ b/metrics/tests/services/test_cleanup.py @@ -115,9 +115,7 @@ def test_cleanup_skips_non_exported_jobs(self): paths = [] for i, status in enumerate(statuses): access_date = date(2012, 3, 10 + i) - path = daily_payloads.build_daily_storage_path( - self.collection, access_date - ) + path = daily_payloads.build_daily_storage_path(self.collection, access_date) resolved = self._write_payload_file(path) self._set_file_age(resolved, 30) paths.append(resolved) @@ -254,9 +252,11 @@ def setUp(self): self.collection = Collection.objects.create(acron3="books", acron2="bk") def test_task_cleanup_daily_payloads_calls_service(self): - with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup: + with patch( + "metrics.services.daily_payloads.cleanup_exported_payloads" + ) as mock_cleanup: mock_cleanup.return_value = 5 - from metrics.tasks import task_cleanup_daily_payloads + from metrics.tasks.cleanup import task_cleanup_daily_payloads result = task_cleanup_daily_payloads.run( collections=["books"], @@ -270,9 +270,11 @@ def test_task_cleanup_daily_payloads_calls_service(self): self.assertEqual(result, {"deleted_payloads": 5}) def test_task_cleanup_with_defaults(self): - with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup: + with patch( + "metrics.services.daily_payloads.cleanup_exported_payloads" + ) as 
mock_cleanup: mock_cleanup.return_value = 0 - from metrics.tasks import task_cleanup_daily_payloads + from metrics.tasks.cleanup import task_cleanup_daily_payloads result = task_cleanup_daily_payloads.run() diff --git a/metrics/tests/test_daily_jobs.py b/metrics/tests/services/test_daily_jobs.py similarity index 54% rename from metrics/tests/test_daily_jobs.py rename to metrics/tests/services/test_daily_jobs.py index f31b410..0413ba6 100644 --- a/metrics/tests/test_daily_jobs.py +++ b/metrics/tests/services/test_daily_jobs.py @@ -1,15 +1,20 @@ from datetime import date, timedelta +from types import SimpleNamespace +from unittest.mock import Mock, patch -from django.contrib.auth import get_user_model from django.test import TestCase from django.utils import timezone -from scielo_usage_counter.values import CONTENT_TYPE_FULL_TEXT, MEDIA_FORMAT_HTML from collection.models import Collection from log_manager import choices from log_manager.models import LogFile from metrics.models import DailyMetricJob -from metrics import services +from metrics.services.jobs import ( + create_or_update_daily_metric_job, + mark_daily_metric_job_exported, + release_stale_daily_metric_jobs, +) +from metrics.services.parsing.job_payloads import build_daily_metric_job_payload class DailyMetricJobServiceTests(TestCase): @@ -23,6 +28,7 @@ def _log_file(self, hash_value, status=choices.LOG_FILE_STATUS_QUEUED): stat_result={}, status=status, collection=self.collection, + date=date(2012, 3, 10), validation={"probably_date": "2012-03-10"}, ) @@ -39,7 +45,7 @@ def test_create_or_update_blocks_implicit_recompute_after_export(self): ) with self.assertRaises(RuntimeError): - services.create_or_update_daily_metric_job( + create_or_update_daily_metric_job( collection=self.collection, access_date=date(2012, 3, 10), log_files=[first, second], @@ -57,7 +63,7 @@ def test_create_or_update_keeps_payload_for_export_retry(self): summary={"month_document_count": 1}, ) - 
services.create_or_update_daily_metric_job( + create_or_update_daily_metric_job( collection=self.collection, access_date=date(2012, 3, 10), log_files=[log_file], @@ -69,7 +75,9 @@ def test_create_or_update_keeps_payload_for_export_retry(self): self.assertEqual(job.payload_hash, "abc") self.assertEqual(job.summary, {"month_document_count": 1}) - def test_create_or_update_clears_stale_payload_when_inputs_change_before_success(self): + def test_create_or_update_clears_stale_payload_when_inputs_change_before_success( + self, + ): first = self._log_file("1" * 32) second = self._log_file("2" * 32) job = DailyMetricJob.objects.create( @@ -82,7 +90,7 @@ def test_create_or_update_clears_stale_payload_when_inputs_change_before_success summary={"month_document_count": 1}, ) - services.create_or_update_daily_metric_job( + create_or_update_daily_metric_job( collection=self.collection, access_date=date(2012, 3, 10), log_files=[first, second], @@ -104,59 +112,99 @@ def test_release_stale_daily_metric_jobs_marks_logs_for_retry(self): export_started_at=timezone.now() - timedelta(minutes=120), ) - released = services.release_stale_daily_metric_jobs(stale_after_minutes=60) + released = release_stale_daily_metric_jobs(stale_after_minutes=60) log_file.refresh_from_db() self.assertEqual(released, 1) self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_ERROR) self.assertIsNone(log_file.parse_heartbeat_at) - def test_process_line_discards_invalid_local_datetime_without_raising(self): - class FakeUtm: - def translate(self, url): - return { - "book_id": "q7gtd", - "pid_generic": "book:q7gtd", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - - log_file = self._log_file("1" * 32) - results = {} - - is_valid, error = services.process_line( - results=results, - line={ - "url": "/id/q7gtd", - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - "local_datetime": None, - }, - 
utm=FakeUtm(), - log_file=log_file, + def test_mark_daily_metric_job_exported_sets_status_and_timestamp(self): + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, ) - self.assertFalse(is_valid) - self.assertIsNone(error) - self.assertEqual(results, {}) + mark_daily_metric_job_exported(job) - def test_mark_daily_metric_job_exported_records_updated_by(self): - user = get_user_model().objects.create_user( - username="tester", - email="tester@example.org", - password="secret", - ) + job.refresh_from_db() + self.assertEqual(job.status, DailyMetricJob.STATUS_EXPORTED) + self.assertIsNotNone(job.exported_at) + + @patch( + "metrics.services.parsing.job_payloads.daily_payloads.write_payload", + return_value="payload-hash", + ) + @patch( + "metrics.services.parsing.job_payloads.index_docs.convert", + return_value={"month": {}, "year": {}}, + ) + @patch( + "metrics.services.parsing.job_payloads.process_line", return_value=(True, None) + ) + @patch("metrics.services.parsing.job_payloads.setup_parsing_environment") + def test_build_daily_metric_job_payload_uses_only_input_log_hashes( + self, + mock_setup_parsing_environment, + mock_process_line, + mock_convert_documents, + mock_write_payload, + ): + selected = self._log_file("1" * 32) + extra = self._log_file("2" * 32) job = DailyMetricJob.objects.create( collection=self.collection, access_date=date(2012, 3, 10), status=DailyMetricJob.STATUS_EXPORTING, + input_log_hashes=[selected.hash], ) - services.mark_daily_metric_job_exported(job, user=user) + parser = Mock() + parser.stats = SimpleNamespace(lines_parsed=1) + parser.parse.return_value = [{"url": "/selected"}] + mock_setup_parsing_environment.return_value = (parser, Mock()) + + payload = build_daily_metric_job_payload( + job, robots_list=["robot"], mmdb=Mock(data={}) + ) + selected.refresh_from_db() + extra.refresh_from_db() job.refresh_from_db() - self.assertEqual(job.status, 
DailyMetricJob.STATUS_EXPORTED) - self.assertIsNotNone(job.exported_at) + + self.assertEqual(payload["input_log_hashes"], [selected.hash]) + self.assertEqual(job.input_log_hashes, [selected.hash]) + self.assertEqual(selected.status, choices.LOG_FILE_STATUS_PARSING) + self.assertEqual(extra.status, choices.LOG_FILE_STATUS_QUEUED) + mock_setup_parsing_environment.assert_called_once() + self.assertEqual( + mock_setup_parsing_environment.call_args.kwargs["log_file"].hash, + selected.hash, + ) + + def test_build_daily_metric_job_payload_rejects_empty_input_hashes(self): + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + input_log_hashes=[], + ) + + with self.assertRaisesMessage(RuntimeError, "has no input log hashes"): + build_daily_metric_job_payload( + job, robots_list=["robot"], mmdb=Mock(data={}) + ) + + def test_build_daily_metric_job_payload_rejects_missing_input_hashes(self): + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + input_log_hashes=["9" * 32], + ) + + with self.assertRaisesMessage(RuntimeError, "is missing log files"): + build_daily_metric_job_payload( + job, robots_list=["robot"], mmdb=Mock(data={}) + ) diff --git a/metrics/tests/test_tasks.py b/metrics/tests/services/test_tasks.py similarity index 59% rename from metrics/tests/test_tasks.py rename to metrics/tests/services/test_tasks.py index 5ffdaf0..abad24e 100644 --- a/metrics/tests/test_tasks.py +++ b/metrics/tests/services/test_tasks.py @@ -7,15 +7,21 @@ from collection.models import Collection from log_manager import choices from log_manager.models import LogFile -from metrics import tasks from metrics.models import DailyMetricJob +from metrics.tasks.log_parsing import ( + task_enqueue_log_parsing_jobs, + task_wait_log_parsing_wave, +) +from metrics.tasks.resume import task_resume_log_exports class 
ParseLogsTaskTests(TestCase): def setUp(self): self.collection = Collection.objects.create(acron3="books", acron2="bk") - def _log_file(self, hash_value, probably_date, status=choices.LOG_FILE_STATUS_QUEUED): + def _log_file( + self, hash_value, probably_date, status=choices.LOG_FILE_STATUS_QUEUED + ): return LogFile.objects.create( hash=hash_value, path=f"/tmp/{hash_value}.log.gz", @@ -26,13 +32,17 @@ def _log_file(self, hash_value, probably_date, status=choices.LOG_FILE_STATUS_QU validation={"probably_date": probably_date}, ) - def test_task_parse_logs_enqueues_one_daily_job_per_collection_date(self): + def test_task_enqueue_log_parsing_jobs_enqueues_one_daily_job_per_collection_date( + self, + ): first = self._log_file("1" * 32, "2012-03-10") second = self._log_file("2" * 32, "2012-03-10") third = self._log_file("3" * 32, "2012-03-15") - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - result = tasks.task_parse_logs.run( + with patch( + "metrics.tasks.log_parsing.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_enqueue_log_parsing_jobs.run( collections=["books"], include_logs_with_error=False, from_date="2012-03-01", @@ -40,17 +50,24 @@ def test_task_parse_logs_enqueues_one_daily_job_per_collection_date(self): ) self.assertEqual(result["enqueued_jobs"], 2) + self.assertEqual(result["enqueued_logs"], 3) self.assertEqual(mocked_apply_async.call_count, 2) jobs = list(DailyMetricJob.objects.order_by("access_date")) - self.assertEqual([job.access_date for job in jobs], [date(2012, 3, 10), date(2012, 3, 15)]) + self.assertEqual( + [job.access_date for job in jobs], [date(2012, 3, 10), date(2012, 3, 15)] + ) self.assertEqual(jobs[0].input_log_hashes, sorted([first.hash, second.hash])) self.assertEqual(jobs[1].input_log_hashes, [third.hash]) - def test_task_parse_logs_allows_queue_override_and_robots_source(self): + def 
test_task_enqueue_log_parsing_jobs_allows_queue_override_and_robots_source( + self, + ): self._log_file("1" * 32, "2012-03-10") - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - tasks.task_parse_logs.run( + with patch( + "metrics.tasks.log_parsing.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + task_enqueue_log_parsing_jobs.run( collections=["books"], include_logs_with_error=False, from_date="2012-03-01", @@ -60,15 +77,46 @@ def test_task_parse_logs_allows_queue_override_and_robots_source(self): ) mocked_apply_async.assert_called_once() - self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult") + self.assertEqual( + mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult" + ) self.assertEqual(mocked_apply_async.call_args.kwargs["args"][-1], "counter") - def test_task_parse_logs_skip_log_hashes_prevents_reprocessing_same_auto_run(self): - skipped = self._log_file("1" * 32, "2012-03-10", status=choices.LOG_FILE_STATUS_ERROR) - queued = self._log_file("2" * 32, "2012-03-11") + def test_task_enqueue_log_parsing_jobs_excludes_error_logs_when_not_requested(self): + queued = self._log_file("1" * 32, "2012-03-10") + error = self._log_file( + "2" * 32, "2012-03-10", status=choices.LOG_FILE_STATUS_ERROR + ) + + with patch( + "metrics.tasks.log_parsing.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_enqueue_log_parsing_jobs.run( + collections=["books"], + include_logs_with_error=False, + from_date="2012-03-01", + until_date="2012-03-31", + ) + + mocked_apply_async.assert_called_once() + job = DailyMetricJob.objects.get() + self.assertEqual(job.input_log_hashes, [queued.hash]) + self.assertNotIn(error.hash, job.input_log_hashes) + self.assertEqual(result["enqueued_logs"], 1) + self.assertEqual(result["enqueued_jobs"], 1) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - 
result = tasks.task_parse_logs.run( + def test_task_enqueue_log_parsing_jobs_skip_log_hashes_prevents_reprocessing_same_auto_run( + self, + ): + skipped = self._log_file( + "1" * 32, "2012-03-10", status=choices.LOG_FILE_STATUS_ERROR + ) + queued = self._log_file("2" * 32, "2012-03-10") + + with patch( + "metrics.tasks.log_parsing.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_enqueue_log_parsing_jobs.run( collections=["books"], include_logs_with_error=True, from_date="2012-03-01", @@ -80,17 +128,45 @@ def test_task_parse_logs_skip_log_hashes_prevents_reprocessing_same_auto_run(sel job = DailyMetricJob.objects.get() self.assertEqual(job.input_log_hashes, [queued.hash]) self.assertEqual(result["enqueued_jobs"], 1) + self.assertEqual(result["enqueued_logs"], 1) - def test_wait_parse_logs_wave_rechecks_until_daily_jobs_complete(self): + def test_task_enqueue_log_parsing_jobs_max_log_files_counts_files_not_jobs(self): + first = self._log_file("1" * 32, "2012-03-10") + second = self._log_file("2" * 32, "2012-03-10") + + with patch( + "metrics.tasks.log_parsing.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_enqueue_log_parsing_jobs.run( + collections=["books"], + include_logs_with_error=False, + max_log_files=1, + from_date="2012-03-01", + until_date="2012-03-31", + ) + + mocked_apply_async.assert_called_once() + job = DailyMetricJob.objects.get() + self.assertEqual(job.input_log_hashes, [first.hash]) + self.assertNotIn(second.hash, job.input_log_hashes) + self.assertEqual(result["enqueued_logs"], 1) + self.assertEqual(result["enqueued_jobs"], 1) + self.assertTrue(result["reached_max_log_files"]) + + def test_wait_log_parsing_wave_rechecks_until_daily_jobs_complete(self): job = DailyMetricJob.objects.create( collection=self.collection, access_date=date(2012, 3, 10), status=DailyMetricJob.STATUS_EXPORTING, ) - with patch("metrics.tasks.task_wait_parse_logs_wave.apply_async") as 
mocked_wait_apply_async: - with patch("metrics.tasks.task_parse_logs.apply_async") as mocked_parse_logs_apply_async: - result = tasks.task_wait_parse_logs_wave.run( + with patch( + "metrics.tasks.log_parsing.task_wait_log_parsing_wave.apply_async" + ) as mocked_wait_apply_async: + with patch( + "metrics.tasks.log_parsing.task_enqueue_log_parsing_jobs.apply_async" + ) as mocked_parse_logs_apply_async: + result = task_wait_log_parsing_wave.run( wave_log_hashes=[job.pk], collections=["books"], include_logs_with_error=False, @@ -98,26 +174,34 @@ def test_wait_parse_logs_wave_rechecks_until_daily_jobs_complete(self): auto_reexecute=True, ) - self.assertEqual(result, {"wave_completed": False, "reexecution_enqueued": False}) + self.assertEqual( + result, {"wave_completed": False, "reexecution_enqueued": False} + ) mocked_parse_logs_apply_async.assert_not_called() mocked_wait_apply_async.assert_called_once() - def test_wait_parse_logs_wave_preserves_queue_name(self): + def test_wait_log_parsing_wave_preserves_queue_name(self): job = DailyMetricJob.objects.create( collection=self.collection, access_date=date(2012, 3, 10), status=DailyMetricJob.STATUS_EXPORTING, ) - with patch("metrics.tasks.task_wait_parse_logs_wave.apply_async") as mocked_wait_apply_async: - result = tasks.task_wait_parse_logs_wave.run( + with patch( + "metrics.tasks.log_parsing.task_wait_log_parsing_wave.apply_async" + ) as mocked_wait_apply_async: + result = task_wait_log_parsing_wave.run( wave_log_hashes=[job.pk], collections=["books"], queue_name="parse_small", ) - self.assertEqual(result, {"wave_completed": False, "reexecution_enqueued": False}) - self.assertEqual(mocked_wait_apply_async.call_args.kwargs["queue"], "parse_small") + self.assertEqual( + result, {"wave_completed": False, "reexecution_enqueued": False} + ) + self.assertEqual( + mocked_wait_apply_async.call_args.kwargs["queue"], "parse_small" + ) class ResumeDailyMetricJobTests(TestCase): @@ -140,8 +224,10 @@ def 
test_resume_log_exports_requeues_error_daily_jobs(self): input_log_hashes=[log_file.hash], ) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - result = tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", @@ -150,7 +236,9 @@ def test_resume_log_exports_requeues_error_daily_jobs(self): mocked_apply_async.assert_called_once() self.assertEqual(mocked_apply_async.call_args.kwargs["args"][0], job.pk) - self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult") + self.assertEqual( + mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult" + ) self.assertEqual(result["resumed_logs"], 1) def test_resume_log_exports_clears_payload_when_current_logs_change(self): @@ -172,8 +260,10 @@ def test_resume_log_exports_clears_payload_when_current_logs_change(self): summary={"month_document_count": 1}, ) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async"): - tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ): + task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", @@ -204,8 +294,10 @@ def test_resume_log_exports_preserves_payload_when_current_logs_match(self): summary={"month_document_count": 1}, ) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async"): - tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ): + task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", @@ -226,8 +318,10 @@ def test_resume_log_exports_requeues_stored_payload_without_current_logs(self): payload_hash="abc", ) - with 
patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - result = tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", @@ -244,8 +338,10 @@ def test_resume_log_exports_skips_jobs_without_logs_or_payload(self): status=DailyMetricJob.STATUS_ERROR, ) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - result = tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", @@ -271,13 +367,15 @@ def test_resume_log_exports_releases_stale_exporting_jobs(self): export_started_at=timezone.now() - timedelta(minutes=120), ) - with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: - result = tasks.task_resume_log_exports.run( + with patch( + "metrics.tasks.resume.task_build_and_export_daily_metric_job.apply_async" + ) as mocked_apply_async: + result = task_resume_log_exports.run( collections=["books"], from_date="2012-03-01", until_date="2012-03-31", stale_after_minutes=60, - ) + ) job.refresh_from_db() self.assertEqual(job.status, DailyMetricJob.STATUS_PENDING) diff --git a/metrics/tests/test_index_utils.py b/metrics/tests/test_index_utils.py deleted file mode 100644 index e0f0aef..0000000 --- a/metrics/tests/test_index_utils.py +++ /dev/null @@ -1,1000 +0,0 @@ -import csv -import unittest -from datetime import datetime -from pathlib import Path -from tempfile import TemporaryDirectory - -from scielo_usage_counter.values import ( - CONTENT_TYPE_ABSTRACT, - CONTENT_TYPE_FULL_TEXT, - CONTENT_TYPE_UNDEFINED, - DEFAULT_SCIELO_ISSN, - MEDIA_FORMAT_HTML, - 
MEDIA_FORMAT_PDF, - MEDIA_FORMAT_UNDEFINED, -) - -from metrics.counter import access -from metrics.counter import documents as index_docs -from metrics.opensearch.names import generate_month_index_name, generate_year_index_name - - -class TestIndexUtils(unittest.TestCase): - def test_is_valid_item_access_data_valid(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertTrue(result) - - def test_is_valid_item_access_data_missing_scielo_issn(self): - data = { - "scielo_issn": "", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertFalse(result) - - def test_is_valid_item_access_data_valid_book_source(self): - data = { - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_generic": "BOOK:Q7GTD", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertTrue(result) - - def test_is_valid_item_access_data_undefined_media_format(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": MEDIA_FORMAT_UNDEFINED, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertFalse(result) - - def test_is_valid_item_access_data_undefined_content_type(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": 
MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_UNDEFINED, - } - result, _ = access.is_valid_item_access_data(data) - self.assertFalse(result) - - def test_is_valid_item_access_data_missing_pid_v2_and_pid_v3(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "", - "pid_v3": "", - "media_language": "en", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertFalse(result) - - def test_is_valid_item_access_data_media_format_html(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertTrue(result) - - def test_is_valid_item_access_data_content_type_abstract(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "en", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_ABSTRACT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertTrue(result) - - def test_is_valid_item_access_data_dataset_without_source_or_language_is_valid( - self, - ): - data = { - "document_type": "dataset", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_v2": None, - "pid_v3": None, - "pid_generic": "DOI:10.48331/SCIELODATA.JLMAIY", - "media_language": "un", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_ABSTRACT, - } - result, _ = access.is_valid_item_access_data(data) - self.assertTrue(result) - - def test_is_valid_item_access_data_missing_media_language_is_invalid(self): - data = { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_language": "", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - result, _ = 
access.is_valid_item_access_data(data) - self.assertFalse(result) - - def test_extract_item_access_data_normalizes_source_fields_for_journal(self): - data = access.extract_item_access_data( - "scl", - { - "scielo_issn": "1234-5678", - "pid_v2": "S0102-67202020000100001", - "media_language": "en", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - "publication_year": "2024", - "journal_main_title": "Journal Title", - "journal_subject_area_capes": ["Health Sciences"], - "journal_subject_area_wos": ["Medicine"], - "journal_acronym": "testjou", - "journal_publisher_name": ["SciELO"], - }, - ) - - self.assertEqual(data["source_type"], "journal") - self.assertEqual(data["source_id"], "1234-5678") - self.assertEqual(data["source_main_title"], "Journal Title") - self.assertEqual(data["source_acronym"], "testjou") - - def test_extract_item_access_data_normalizes_source_fields_for_books(self): - data = access.extract_item_access_data( - "books", - { - "book_id": "q7gtd", - "book_title": "Book Title", - "title_pid_generic": "book:q7gtd", - "pid_generic": "book:q7gtd/chapter:03", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - "publication_year": "2023", - }, - ) - - self.assertEqual(data["source_type"], "book") - self.assertEqual(data["source_id"], "q7gtd") - self.assertEqual(data["scielo_issn"], DEFAULT_SCIELO_ISSN) - self.assertEqual(data["source_main_title"], "Book Title") - self.assertEqual(data["title_pid_generic"], "BOOK:Q7GTD") - - def test_extract_item_access_data_preserves_access_url_and_free_to_read(self): - data = access.extract_item_access_data( - "books", - { - "book_id": "c2248", - "book_title": "Book Title", - "title_pid_generic": "book:c2248", - "pid_generic": "book:c2248", - "media_language": "pt", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_url": "/id/c2248/pdf/freitas-9788599662830.pdf", - "source_access_type": "free_to_read", 
- }, - ) - - self.assertEqual(data["access_url"], "/id/c2248/pdf/freitas-9788599662830.pdf") - self.assertEqual(data["counter_access_type"], "Free_To_Read") - - def test_extract_item_access_data_tolerates_malformed_media_language(self): - data = access.extract_item_access_data( - "books", - { - "book_id": "q7gtd", - "pid_generic": "book:q7gtd", - "media_language": "'", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - - self.assertEqual(data["media_language"], "un") - - def test_extract_item_access_data_sets_document_title_by_type(self): - chapter = access.extract_item_access_data( - "books", - { - "book_id": "q7gtd", - "chapter_id": "03", - "pid_generic": "book:q7gtd/chapter:03", - "book_title": "Book Title", - "chapter_title": "Chapter Title", - "media_format": MEDIA_FORMAT_HTML, - "media_language": "en", - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - book = access.extract_item_access_data( - "books", - { - "book_id": "q7gtd", - "pid_generic": "book:q7gtd", - "book_title": "Book Title", - "media_format": MEDIA_FORMAT_HTML, - "media_language": "en", - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - article = access.extract_item_access_data( - "scl", - { - "scielo_issn": "1234-5678", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "article_title": "Article Title", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - - self.assertEqual(chapter["document_title"], "Chapter Title") - self.assertEqual(book["document_title"], "Book Title") - self.assertEqual(article["document_title"], "Article Title") - - def test_extract_item_access_data_normalizes_scielo_collection_document_types(self): - preprint = access.extract_item_access_data( - "preprints", - { - "pid_generic": "10.1590/SciELOPreprints.1234", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - dataset = access.extract_item_access_data( - "data", - { - "pid_generic": "10.48331/scielodata.abc123", - 
"media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_ABSTRACT, - }, - ) - article = access.extract_item_access_data( - "scl", - { - "scielo_issn": "1234-5678", - "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - }, - ) - - self.assertEqual(preprint["source_type"], "preprint_server") - self.assertEqual(preprint["document_type"], "preprint") - self.assertEqual(dataset["source_type"], "data_repository") - self.assertEqual(dataset["document_type"], "dataset") - self.assertEqual(article["source_type"], "journal") - self.assertEqual(article["document_type"], "article") - - def test_update_results_with_item_access_data_stores_source_and_periods(self): - results = {} - item_access_data = { - "collection": "books", - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:Q7GTD", - "title_pid_generic": "BOOK:Q7GTD", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - "publication_year": "2023", - "document_title": "Book Title", - "source_main_title": "Book Title", - "source_subject_area_capes": [], - "source_subject_area_wos": [], - "source_acronym": None, - "source_publisher_name": ["SciELO Books"], - } - line = { - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - "local_datetime": datetime(2024, 1, 15, 10, 0, 5), - } - - access.update_results_with_item_access_data(results, item_access_data, line) - - self.assertEqual(len(results), 1) - result = next(iter(results.values())) - self.assertEqual(result["source"]["source_type"], "book") - self.assertEqual(result["source"]["source_id"], "q7gtd") - self.assertEqual(result["source"]["main_title"], "Book Title") - self.assertEqual(result["access_date"], "2024-01-15") - self.assertEqual(result["access_month"], "202401") - 
self.assertEqual(result["access_year"], "2024") - self.assertEqual(result["access_country_code"], "BR") - self.assertEqual(result["content_language"], "en") - self.assertEqual(result["title_pid_generic"], "BOOK:Q7GTD") - self.assertEqual(result["document"], {"title": "Book Title"}) - self.assertIn("user_session_id", result) - - def test_update_results_with_item_access_data_rejects_invalid_local_datetime(self): - results = {} - item_access_data = { - "collection": "books", - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_generic": "BOOK:Q7GTD", - "media_language": "en", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - } - line = { - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - "local_datetime": None, - } - - with self.assertRaises(ValueError): - access.update_results_with_item_access_data(results, item_access_data, line) - - self.assertEqual(results, {}) - - def test_update_results_with_item_access_data_does_not_expand_book_into_segments( - self, - ): - results = {} - item_access_data = { - "collection": "books", - "source_type": "book", - "source_id": "c2248", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248", - "title_pid_generic": "BOOK:C2248", - "segment_pid_generics": [ - "BOOK:C2248/CHAPTER:00", - "BOOK:C2248/CHAPTER:01", - "BOOK:C2248/CHAPTER:02", - ], - "media_language": "pt", - "media_format": MEDIA_FORMAT_PDF, - "content_type": CONTENT_TYPE_FULL_TEXT, - "publication_year": "2018", - "source_main_title": "C2248 Book", - } - line = { - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - "local_datetime": datetime(2024, 1, 15, 10, 0, 5), - } - - access.update_results_with_item_access_data(results, item_access_data, line) - - self.assertEqual(len(results), 1) - result = list(results.values())[0] - 
self.assertEqual(result["pid_generic"], "BOOK:C2248") - - def test_double_click_filter_uses_url_bucket_for_same_item(self): - results = {} - item_access_data = { - "collection": "books", - "source_type": "book", - "source_id": "c2248", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248/CHAPTER:03", - "title_pid_generic": "BOOK:C2248", - "media_language": "pt", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - "publication_year": "2018", - "source_main_title": "C2248 Book", - } - base_line = { - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - } - - access.update_results_with_item_access_data( - results, - item_access_data, - { - **base_line, - "local_datetime": datetime(2024, 1, 15, 10, 0, 5), - "url": "/id/c2248/03", - }, - ) - access.update_results_with_item_access_data( - results, - item_access_data, - { - **base_line, - "local_datetime": datetime(2024, 1, 15, 10, 0, 20), - "url": "https://books.scielo.org/id/c2248/epub/03.html?x=1", - }, - ) - - raw = next(iter(results.values())) - self.assertEqual( - set(raw["click_timestamps_by_url"]), - {"/id/c2248/03", "/id/c2248/epub/03.html"}, - ) - - metrics_data = index_docs.convert_raw_results_to_index_documents(results) - month_item = metrics_data["month"][ - "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" - ] - - self.assertEqual(month_item["total_requests"], 2) - self.assertEqual(month_item["unique_requests"], 1) - - def test_double_click_filter_collapses_same_url_within_30_seconds(self): - results = {} - item_access_data = { - "collection": "books", - "source_type": "book", - "source_id": "c2248", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248/CHAPTER:03", - "title_pid_generic": "BOOK:C2248", - "media_language": "pt", - "media_format": MEDIA_FORMAT_HTML, - "content_type": CONTENT_TYPE_FULL_TEXT, - 
"publication_year": "2018", - "source_main_title": "C2248 Book", - } - base_line = { - "client_name": "browser", - "client_version": "1.0", - "ip_address": "127.0.0.1", - "country_code": "BR", - "url": "/id/c2248/03?from=search", - } - - access.update_results_with_item_access_data( - results, - item_access_data, - {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 5)}, - ) - access.update_results_with_item_access_data( - results, - item_access_data, - {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 20)}, - ) - - raw = next(iter(results.values())) - self.assertEqual( - raw["click_timestamps_by_url"], - {"/id/c2248/03": {"00:05": 1, "00:20": 1}}, - ) - - metrics_data = index_docs.convert_raw_results_to_index_documents(results) - month_item = metrics_data["month"][ - "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" - ] - - self.assertEqual(month_item["total_requests"], 1) - self.assertEqual(month_item["unique_requests"], 1) - - def test_generate_index_names_for_year_and_month(self): - self.assertEqual( - generate_year_index_name("usage", "scl", "2024-01-15"), - "usage_yearly_scl_2024", - ) - self.assertEqual( - generate_month_index_name("usage", "scl", "2024-01-15"), - "usage_monthly_scl_2024", - ) - self.assertEqual( - generate_year_index_name("usage", "books", "2024-01-15"), - "usage_yearly_books", - ) - self.assertEqual( - generate_month_index_name("usage", "books", "2024-01-15"), - "usage_monthly_books", - ) - - def test_convert_raw_results_to_index_documents_creates_month_and_year_views(self): - data = { - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|browser|1.0|127.0.0.1|BR|en|html|full_text": { - "collection": "books", - "source_key": "q7gtd", - "document_type": "chapter", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:Q7GTD/CHAPTER:03", - "document": {"title": "Chapter Title"}, - "title_pid_generic": "BOOK:Q7GTD", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - 
"access_country_code": "BR", - "content_language": "en", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_month": "202401", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "main_title": "Book Title", - "identifiers": { - "book_id": "q7gtd", - "isbn": "9788578791889", - }, - "city": "Sao Paulo", - "country": "BR", - "subject_area_capes": [], - "subject_area_wos": [], - "acronym": None, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2023", - } - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - - self.assertEqual(set(metrics_data.keys()), {"month", "year"}) - self.assertEqual(len(metrics_data["month"]), 2) - self.assertEqual(len(metrics_data["year"]), 2) - - month_item = metrics_data["month"][ - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023" - ] - self.assertEqual(month_item["access"], {"month": "2024-01"}) - self.assertIn("daily_metrics", month_item) - self.assertNotIn("by_day", month_item) - self.assertNotIn("access_country_code", month_item) - self.assertNotIn("content_language", month_item) - self.assertEqual(month_item["document"]["id"], "BOOK:Q7GTD/CHAPTER:03") - self.assertEqual(month_item["document"]["type"], "chapter") - self.assertEqual(month_item["document"]["title"], "Chapter Title") - self.assertEqual(month_item["document"]["parent_id"], "BOOK:Q7GTD") - self.assertEqual(month_item["document"]["publication_year"], "2023") - self.assertEqual(month_item["document"]["identifiers"]["book_id"], "q7gtd") - self.assertEqual(month_item["document"]["identifiers"]["chapter_id"], "03") - self.assertEqual(month_item["document"]["identifiers"]["isbn"], "9788578791889") - self.assertNotIn("pid_generic", month_item["document"]["identifiers"]) - self.assertEqual(month_item["counter"]["metric_scope"], "item") - self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") - 
self.assertEqual(month_item["total_requests"], 1) - self.assertEqual(month_item["unique_requests"], 1) - self.assertNotIn("scielo_issn", month_item["source"]) - self.assertNotIn("book_id", month_item["source"]["identifiers"]) - self.assertEqual(month_item["source"]["publisher_name"], ["SciELO Books"]) - - month_title = metrics_data["month"][ - "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" - ] - self.assertEqual(month_title["document"]["id"], "BOOK:Q7GTD") - self.assertEqual(month_title["document"]["type"], "book") - self.assertEqual(month_title["document"]["title"], "Book Title") - self.assertNotIn("parent_id", month_title["document"]) - self.assertEqual(month_title["counter"]["metric_scope"], "title") - self.assertEqual(month_title["counter"]["data_type"], "Book") - self.assertEqual(month_title["total_requests"], 1) - self.assertEqual(month_title["total_investigations"], 1) - self.assertEqual(month_title["unique_requests"], 1) - self.assertEqual(month_title["unique_investigations"], 1) - - year_item = metrics_data["year"][ - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|en|BR|2024|Open|Regular|2023" - ] - self.assertEqual( - year_item["access"], - {"year": "2024", "country_code": "BR", "content_language": "en"}, - ) - self.assertNotIn("daily_metrics", year_item) - self.assertNotIn("by_day", year_item) - self.assertNotIn("access_month", year_item) - self.assertEqual(year_item["document"]["title"], "Chapter Title") - self.assertEqual(year_item["counter"]["metric_scope"], "item") - self.assertEqual(year_item["total_requests"], 1) - - year_title = metrics_data["year"][ - "title|books|q7gtd|||BOOK:Q7GTD|en|BR|2024|Open|Regular|2023" - ] - self.assertEqual(year_title["counter"]["metric_scope"], "title") - self.assertEqual(year_title["document"]["title"], "Book Title") - self.assertNotIn("daily_metrics", year_title) - self.assertNotIn("by_day", year_title) - self.assertNotIn("access_month", year_title) - self.assertEqual(year_title["total_requests"], 1) - 
self.assertEqual(year_title["total_investigations"], 1) - self.assertEqual(year_title["unique_requests"], 1) - self.assertEqual(year_title["unique_investigations"], 1) - - def test_convert_raw_results_to_index_documents_maps_counter_data_types(self): - data = { - "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|sess|BR|un|html|full_text": { - "collection": "preprints", - "source_key": "scielo-preprints", - "document_type": "preprint", - "pid_generic": "10.1590/SCIELOPREPRINTS.1234", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "un", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_year": "2024", - "source": { - "source_type": "preprint_server", - "source_id": "scielo-preprints", - "main_title": "SciELO Preprints", - }, - "publication_year": "2024", - }, - "data|scielo-data|||10.48331/SCIELODATA.ABC123|sess|BR|un|html|abstract": { - "collection": "data", - "source_key": "scielo-data", - "document_type": "dataset", - "pid_generic": "10.48331/SCIELODATA.ABC123", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "un", - "content_type": CONTENT_TYPE_ABSTRACT, - "access_date": "2024-01-15", - "access_year": "2024", - "source": { - "source_type": "data_repository", - "source_id": "scielo-data", - "main_title": "SciELO Data", - }, - "publication_year": "2024", - }, - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - preprint_doc = metrics_data["month"][ - "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|2024-01|Open|Regular|2024" - ] - dataset_doc = metrics_data["month"][ - "data|scielo-data|||10.48331/SCIELODATA.ABC123|2024-01|Open|Regular|2024" - ] - - self.assertEqual(preprint_doc["counter"]["data_type"], "Article") - self.assertEqual(preprint_doc["document"]["type"], "preprint") - 
self.assertEqual(preprint_doc["document"]["id"], "10.1590/SCIELOPREPRINTS.1234") - self.assertNotIn("pid_generic", preprint_doc["document"].get("identifiers", {})) - self.assertNotIn("scielo_document_type", preprint_doc) - self.assertEqual(preprint_doc["counter"]["article_version"], "Preprint") - self.assertEqual(dataset_doc["counter"]["data_type"], "Dataset") - self.assertNotIn("article_version", dataset_doc["counter"]) - - def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_formats( - self, - ): - data = { - "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|html|full_text": { - "collection": "books", - "source_key": "c2248", - "document_type": "chapter", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248/CHAPTER:03", - "title_pid_generic": "BOOK:C2248", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "pt", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_month": "202401", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "c2248", - "main_title": "C2248 Book", - "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2018", - }, - "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": { - "collection": "books", - "source_key": "c2248", - "document_type": "chapter", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248/CHAPTER:03", - "title_pid_generic": "BOOK:C2248", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:45": 1}, - "access_country_code": "BR", - "content_language": "pt", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_month": "202401", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "c2248", - "main_title": "C2248 Book", - "identifiers": {"book_id": "c2248", 
"isbn": "9788599662830"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2018", - }, - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - - month_item = metrics_data["month"][ - "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" - ] - month_title = metrics_data["month"][ - "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" - ] - - self.assertEqual(month_item["total_requests"], 2) - self.assertEqual(month_item["total_investigations"], 2) - self.assertEqual(month_item["unique_requests"], 1) - self.assertEqual(month_item["unique_investigations"], 1) - self.assertEqual(month_title["unique_requests"], 1) - self.assertEqual(month_title["unique_investigations"], 1) - - def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_item_scope( - self, - ): - data = { - "books|c2248|||BOOK:C2248|sess|BR|pt|html|abstract": { - "collection": "books", - "source_key": "c2248", - "document_type": "book", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248", - "document": {"title": "C2248 Book"}, - "title_pid_generic": "BOOK:C2248", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "pt", - "content_type": CONTENT_TYPE_ABSTRACT, - "access_date": "2024-01-15", - "access_month": "202401", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "c2248", - "main_title": "C2248 Book", - "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2018", - }, - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - - self.assertEqual( - set(metrics_data["month"].keys()), - {"title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"}, - ) - self.assertEqual( - set(metrics_data["year"].keys()), - {"title|books|c2248|||BOOK:C2248|pt|BR|2024|Open|Regular|2018"}, - ) - - def 
test_convert_raw_results_to_index_documents_counts_whole_book_without_segments_as_book_segment( - self, - ): - data = { - "books|c2248|||BOOK:C2248|sess|BR|pt|pdf|full_text": { - "collection": "books", - "source_key": "c2248", - "document_type": "book", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248", - "document": {"title": "C2248 Book"}, - "title_pid_generic": "BOOK:C2248", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "pt", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_month": "202401", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "c2248", - "main_title": "C2248 Book", - "identifiers": {"book_id": "c2248"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2018", - }, - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - month_item = metrics_data["month"][ - "books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" - ] - month_title = metrics_data["month"][ - "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" - ] - - self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") - self.assertEqual(month_item["counter"]["metric_scope"], "item") - self.assertEqual(month_item["document"]["id"], "BOOK:C2248") - self.assertEqual(month_item["document"]["title"], "C2248 Book") - self.assertNotIn("parent_id", month_item["document"]) - self.assertEqual(month_title["counter"]["data_type"], "Book") - self.assertEqual(month_title["counter"]["metric_scope"], "title") - self.assertEqual(month_title["document"]["id"], "BOOK:C2248") - self.assertEqual(month_title["document"]["title"], "C2248 Book") - - def test_convert_raw_results_aggregates_multiple_chapters_correctly(self): - """Test that accessing multiple chapters creates correct title-level totals""" - data = { - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|session1|BR|en|html|full_text": { 
- "collection": "books", - "source_key": "q7gtd", - "document_type": "chapter", - "pid_generic": "BOOK:Q7GTD/CHAPTER:01", - "title_pid_generic": "BOOK:Q7GTD", - "user_session_id": "session1", - "click_timestamps": {"00:05": 1}, - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "main_title": "Book Title", - "identifiers": {"book_id": "q7gtd"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2023", - }, - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|session1|BR|en|html|full_text": { - "collection": "books", - "source_key": "q7gtd", - "document_type": "chapter", - "pid_generic": "BOOK:Q7GTD/CHAPTER:02", - "title_pid_generic": "BOOK:Q7GTD", - "user_session_id": "session1", # SAME SESSION - "click_timestamps": {"00:10": 1}, - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "q7gtd", - "scielo_issn": DEFAULT_SCIELO_ISSN, - "main_title": "Book Title", - "identifiers": {"book_id": "q7gtd"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2023", - }, - } - - metrics_data = index_docs.convert_raw_results_to_index_documents(data) - - # Should have 2 item documents (one per chapter) + 2 title documents (month and year) - self.assertEqual(len(metrics_data["month"]), 3) # 2 items + 1 title - self.assertEqual(len(metrics_data["year"]), 3) # 2 items + 1 title - - # Each item should have total=1, unique=1 - month_item_1 = metrics_data["month"][ - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|2024-01|Open|Regular|2023" - ] - self.assertEqual(month_item_1["total_requests"], 1) - self.assertEqual(month_item_1["unique_requests"], 1) - - month_item_2 = metrics_data["month"][ - "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|2024-01|Open|Regular|2023" - ] - self.assertEqual(month_item_2["total_requests"], 1) - 
self.assertEqual(month_item_2["unique_requests"], 1) - - # Title should have total=2 (sum of both chapters) - # Title unique should be 1 (same session accessed book, counted once) - month_title = metrics_data["month"][ - "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" - ] - self.assertEqual(month_title["total_requests"], 2) - self.assertEqual(month_title["total_investigations"], 2) - self.assertEqual(month_title["unique_requests"], 1) - self.assertEqual(month_title["unique_investigations"], 1) - - def test_export_book_r51_monthly_metrics_writes_counter_title_columns(self): - from metrics.management.commands.export_book_r51_monthly_metrics import Command - - command = Command() - monthly_documents = command._build_monthly_documents( - { - "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": { - "collection": "books", - "source_key": "c2248", - "document_type": "chapter", - "pid_v2": None, - "pid_v3": None, - "pid_generic": "BOOK:C2248/CHAPTER:03", - "title_pid_generic": "BOOK:C2248", - "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", - "click_timestamps": {"00:05": 1}, - "access_country_code": "BR", - "content_language": "pt", - "content_type": CONTENT_TYPE_FULL_TEXT, - "access_date": "2024-01-15", - "access_year": "2024", - "source": { - "source_type": "book", - "source_id": "c2248", - "main_title": "C2248 Book", - "identifiers": {"book_id": "c2248"}, - "publisher_name": ["SciELO Books"], - }, - "publication_year": "2018", - } - } - ) - - with TemporaryDirectory() as tmpdir: - title_path = Path(tmpdir) / "title.csv" - command._write_title_csv(title_path, monthly_documents["title"]) - - with title_path.open(newline="") as fh: - reader = csv.DictReader(fh) - rows = list(reader) - - self.assertEqual( - reader.fieldnames, - [ - "year_month", - "title_pid_generic", - "document_type", - "total_item_requests", - "total_item_investigations", - "unique_title_requests", - "unique_title_investigations", - ], - ) - 
self.assertNotIn("total_title_requests", reader.fieldnames) - self.assertEqual(rows[0]["year_month"], "2024-01") - self.assertEqual(rows[0]["total_item_requests"], "1") - self.assertEqual(rows[0]["unique_title_requests"], "1") diff --git a/metrics/views.py b/metrics/views.py deleted file mode 100755 index 91ea44a..0000000 --- a/metrics/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. diff --git a/metrics/wagtail_hooks.py b/metrics/wagtail_hooks.py index 94c2ffb..82b6d52 100644 --- a/metrics/wagtail_hooks.py +++ b/metrics/wagtail_hooks.py @@ -3,6 +3,7 @@ from metrics.models import DailyMetricJob + class DailyMetricJobSnippetViewSet(SnippetViewSet): model = DailyMetricJob menu_label = _("Daily Metric Jobs") diff --git a/pytest.ini b/pytest.ini index c2b3a23..e59c146 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] addopts = --ds=config.settings.test --reuse-db python_files = tests.py test_*.py +norecursedirs = src diff --git a/reports/models.py b/reports/models.py index 3af1ec8..6311034 100644 --- a/reports/models.py +++ b/reports/models.py @@ -31,6 +31,7 @@ def pct_validated(self): if not self.total_files: return 0 return round(self.validated_files / self.total_files * 100, 1) + pct_validated.fget.short_description = _("% Valid Files") @property @@ -38,6 +39,7 @@ def pct_valid_lines(self): if not self.lines_parsed: return 0 return round(self.valid_lines / self.lines_parsed * 100, 1) + pct_valid_lines.fget.short_description = _("% Valid Lines") @property @@ -46,6 +48,7 @@ def pct_remote_ip(self): if not total: return 0 return round(self.ip_remote_count / total * 100, 1) + pct_remote_ip.fget.short_description = _("% Remote IP") def __str__(self): diff --git a/reports/services/__init__.py b/reports/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reports/services/dates.py b/reports/services/dates.py new file mode 100644 index 0000000..01c425e --- /dev/null +++ 
b/reports/services/dates.py @@ -0,0 +1,26 @@ +import re + +from core.utils import date_utils + + +def get_report_date_from_log_file(log_file): + if log_file.date: + return log_file.date + + validation_date = (log_file.validation or {}).get("probably_date") + if isinstance(validation_date, str) and validation_date: + return date_utils.get_date_obj(validation_date) + + return _get_report_date_from_log_file_path(log_file.path) + + +def _get_report_date_from_log_file_path(path): + try: + match = re.search(r"(\d{4}-\d{2}-\d{2})", path) + except TypeError: + return None + + if not match: + return None + + return date_utils.get_date_obj(match.group(1)) diff --git a/reports/services/emails.py b/reports/services/emails.py new file mode 100644 index 0000000..9897458 --- /dev/null +++ b/reports/services/emails.py @@ -0,0 +1,164 @@ +import logging + +from django.conf import settings +from django.core.mail import send_mail +from django.utils.translation import gettext as _ + +from collection.models import Collection +from core.utils import date_utils +from log_manager_config import models as lmc_models +from reports.models import MonthlyLogReport + + +def send_log_report_summary_emails( + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, +): + from_date_str, until_date_str = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + subject = _("Usage Log Report Summary " f"({from_date_str} to {until_date_str})") + + for collection_acron in collections or Collection.acron3_list(): + try: + collection = Collection.objects.get(acron3=collection_acron) + except Collection.DoesNotExist: + logging.warning("Collection not found: %s", collection_acron) + continue + + message = _build_collection_log_report_email_message( + collection, + from_date_str, + until_date_str, + ) + + if not message: + continue + + logging.info( + "Sending email to collection %s. 
Subject: %s.", + collection.main_name, + subject, + ) + + _send_log_report_email_to_collection(subject, message, collection_acron) + + +def _build_collection_log_report_email_message( + collection, + from_date_str, + until_date_str, +): + monthly_reports = _list_latest_monthly_reports(collection) + if not monthly_reports: + return "" + + latest_report = monthly_reports[0] + message = _build_current_month_report_section( + collection, + latest_report, + from_date_str, + until_date_str, + ) + + if len(monthly_reports) > 1: + previous_report = monthly_reports[1] + message += _build_previous_month_report_section(previous_report) + message += _build_month_over_month_report_section( + latest_report, + previous_report, + ) + + message += "\n---\nThis report is automatically generated by SciELO Usage.\n" + return message + + +def _list_latest_monthly_reports(collection): + return list( + MonthlyLogReport.objects.filter( + collection=collection, + ).order_by( + "-year", "-month" + )[:2] + ) + + +def _build_current_month_report_section( + collection, + report, + from_date_str, + until_date_str, +): + message = _( + f"Usage Log Report for {collection.acron3}\n" + f"Period: {from_date_str} to {until_date_str}\n\n" + ) + message += _("Latest month ({latest}):\n").format(latest=report.period_label) + message += ( + f" Total files: {report.total_files}\n" + f" Validated files: {report.validated_files} ({report.pct_validated}%)\n" + f" Invalidated files: {report.invalidated_files}\n" + f" Errored files: {report.errored_files}\n" + f" Lines parsed: {report.lines_parsed}\n" + f" Valid lines: {report.valid_lines} ({report.pct_valid_lines}%)\n" + f" Discarded lines: {report.discarded_lines}\n" + f" Remote IPs: {report.ip_remote_count} ({report.pct_remote_ip}%)\n" + f" Local IPs: {report.ip_local_count}\n" + ) + return message + + +def _build_previous_month_report_section(report): + message = _("\nPrevious month ({prev}):\n").format(prev=report.period_label) + message += ( + f" Total 
files: {report.total_files}\n" + f" Validated files: {report.validated_files} ({report.pct_validated}%)\n" + f" Valid lines: {report.valid_lines} ({report.pct_valid_lines}%)\n" + f" Remote IPs: {report.ip_remote_count} ({report.pct_remote_ip}%)\n" + ) + return message + + +def _build_month_over_month_report_section(latest_report, previous_report): + if not previous_report.total_files: + return "" + + file_diff = latest_report.total_files - previous_report.total_files + line_diff = latest_report.lines_parsed - previous_report.lines_parsed + + message = _("\nMonth-over-month change:\n") + message += f" Files: {file_diff:+d}\n" + message += f" Lines: {line_diff:+d}\n" + return message + + +def _send_log_report_email_to_collection(subject, message, collection_acron): + emails = lmc_models.CollectionEmail.objects.filter( + config__collection__acron3=collection_acron, + active=True, + ).values_list("email", flat=True) + + if not emails: + logging.error( + "Error. Please, add an E-mail Configuration for the collection %s.", + collection_acron, + ) + return + + try: + send_mail( + subject=subject, + message=message, + from_email=settings.DEFAULT_FROM_EMAIL, + recipient_list=list(emails), + ) + except Exception as e: + logging.error( + "Error sending log files report for %s: %s", + collection_acron, + e, + ) diff --git a/reports/services/log_report.py b/reports/services/log_report.py new file mode 100644 index 0000000..18dad6a --- /dev/null +++ b/reports/services/log_report.py @@ -0,0 +1,164 @@ +import logging +from collections import defaultdict + +from log_manager import choices +from log_manager.models import LogFile +from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport +from reports.services.dates import get_report_date_from_log_file + +VALIDATED_FILE_STATUSES = { + choices.LOG_FILE_STATUS_QUEUED, + choices.LOG_FILE_STATUS_PARSING, + choices.LOG_FILE_STATUS_PROCESSED, +} + + +def populate_log_report_tables(year=None, collection_acron=None): + 
totals_by_period = _build_log_report_totals_by_period( + year=year, + collection_acron=collection_acron, + ) + + weekly_count = _upsert_log_report_records( + WeeklyLogReport, + totals_by_period["weekly"], + ) + monthly_count = _upsert_log_report_records( + MonthlyLogReport, + totals_by_period["monthly"], + ) + yearly_count = _upsert_log_report_records( + YearlyLogReport, + totals_by_period["yearly"], + ) + + logging.info( + "Reports populated: %s weekly, %s monthly, %s yearly.", + weekly_count, + monthly_count, + yearly_count, + ) + + return f"Weekly: {weekly_count}, Monthly: {monthly_count}, Yearly: {yearly_count}" + + +def _build_log_report_totals_by_period(year=None, collection_acron=None): + totals_by_period = { + "weekly": defaultdict(lambda: defaultdict(int)), + "monthly": defaultdict(lambda: defaultdict(int)), + "yearly": defaultdict(lambda: defaultdict(int)), + } + + for log_file in _iter_reportable_log_files(collection_acron=collection_acron): + report_date = get_report_date_from_log_file(log_file) + if not report_date: + continue + + if year and report_date.year != int(year): + continue + + _add_log_file_to_period_totals(totals_by_period, log_file, report_date) + + return totals_by_period + + +def _iter_reportable_log_files(collection_acron=None): + queryset = LogFile.objects.select_related("collection") + + if collection_acron: + queryset = queryset.filter(collection__acron3=collection_acron) + + queryset = queryset.only( + "id", + "collection_id", + "date", + "path", + "status", + "summary", + "validation", + ) + + return queryset.iterator(chunk_size=2000) + + +def _add_log_file_to_period_totals(totals_by_period, log_file, report_date): + iso_year, iso_week, _ = report_date.isocalendar() + + period_keys = { + "weekly": (log_file.collection_id, iso_year, iso_week), + "monthly": (log_file.collection_id, report_date.year, report_date.month), + "yearly": (log_file.collection_id, report_date.year), + } + + for period_name, period_key in period_keys.items(): 
+ totals = totals_by_period[period_name][period_key] + _add_log_file_metrics_to_totals(totals, log_file) + + +def _add_log_file_metrics_to_totals(totals, log_file): + totals["total_files"] += 1 + + _add_log_file_status_to_totals(totals, log_file.status) + _add_log_file_line_counts_to_totals(totals, log_file.summary or {}) + _add_log_file_ip_counts_to_totals(totals, log_file.validation or {}) + + +def _add_log_file_status_to_totals(totals, status): + if status == choices.LOG_FILE_STATUS_CREATED: + totals["created_files"] += 1 + return + + if status in VALIDATED_FILE_STATUSES: + totals["validated_files"] += 1 + return + + if status == choices.LOG_FILE_STATUS_INVALIDATED: + totals["invalidated_files"] += 1 + return + + if status == choices.LOG_FILE_STATUS_ERROR: + totals["errored_files"] += 1 + + +def _add_log_file_line_counts_to_totals(totals, summary): + lines_parsed = summary.get("lines_parsed", 0) or 0 + valid_lines = summary.get("valid_lines", 0) or 0 + + totals["lines_parsed"] += lines_parsed + totals["valid_lines"] += valid_lines + totals["discarded_lines"] += max(lines_parsed - valid_lines, 0) + + +def _add_log_file_ip_counts_to_totals(totals, validation): + ip_counts = validation.get("content", {}).get("summary", {}).get("ips", {}) + + totals["ip_local_count"] += ip_counts.get("local", 0) or 0 + totals["ip_remote_count"] += ip_counts.get("remote", 0) or 0 + totals["ip_unknown_count"] += ip_counts.get("unknown", 0) or 0 + + +def _upsert_log_report_records(model_class, totals_by_key): + count = 0 + period_fields = _get_report_model_period_fields(model_class) + + for period_key, totals in totals_by_key.items(): + lookup = _build_log_report_record_lookup(period_fields, period_key) + model_class.objects.update_or_create(defaults=totals, **lookup) + count += 1 + + return count + + +def _get_report_model_period_fields(model_class): + unique_fields = list(model_class._meta.unique_together[0]) + return unique_fields[1:] + + +def 
_build_log_report_record_lookup(period_fields, period_key): + lookup = {"collection_id": period_key[0]} + period_values = period_key[1:] + + for idx, field_name in enumerate(period_fields): + lookup[field_name] = period_values[idx] + + return lookup diff --git a/reports/tasks.py b/reports/tasks.py index 6a70048..81894e4 100644 --- a/reports/tasks.py +++ b/reports/tasks.py @@ -1,121 +1,13 @@ -import logging -import re -from collections import defaultdict - -from django.core.mail import send_mail -from django.conf import settings -from django.utils.translation import gettext as _ - from config import celery_app -from core.utils import date_utils -from collection.models import Collection -from log_manager import choices -from log_manager.models import LogFile -from log_manager_config import models as lmc_models - -from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport - - -def _extract_date_from_log_file(lf): - if lf.date: - return lf.date - - probably_date = (lf.validation or {}).get("probably_date") - if isinstance(probably_date, str) and probably_date: - return date_utils.get_date_obj(probably_date) - - try: - match = re.search(r"(\d{4}-\d{2}-\d{2})", lf.path) - if match: - return date_utils.get_date_obj(match.group(1)) - except Exception: - pass - - return None +from reports.services import emails, log_report @celery_app.task(bind=True, name="[Reports] Populate All Reports") def task_populate_all_reports(self, year=None, collection_acron=None): - qs = LogFile.objects.select_related("collection") - if collection_acron: - qs = qs.filter(collection__acron3=collection_acron) - qs = qs.only( - "id", "collection_id", "date", "path", "status", "summary", "validation" - ) - - weekly = defaultdict(lambda: defaultdict(int)) - monthly = defaultdict(lambda: defaultdict(int)) - yearly = defaultdict(lambda: defaultdict(int)) - - for lf in qs.iterator(chunk_size=2000): - extracted_date = _extract_date_from_log_file(lf) - if not extracted_date: - continue - 
if year and extracted_date.year != int(year): - continue - - iso_year, iso_week, _ = extracted_date.isocalendar() - yr = extracted_date.year - mo = extracted_date.month - - for agg, key in [ - (weekly, (lf.collection_id, iso_year, iso_week)), - (monthly, (lf.collection_id, yr, mo)), - (yearly, (lf.collection_id, yr)), - ]: - r = agg[key] - r["total_files"] += 1 - st = lf.status - if st == "CRE": - r["created_files"] += 1 - elif st in ("QUE", "PAR", "PRO"): - r["validated_files"] += 1 - elif st == "INV": - r["invalidated_files"] += 1 - elif st == "ERR": - r["errored_files"] += 1 - - s = lf.summary or {} - lp = s.get("lines_parsed", 0) or 0 - vl = s.get("valid_lines", 0) or 0 - r["lines_parsed"] += lp - r["valid_lines"] += vl - r["discarded_lines"] += max(lp - vl, 0) - - ips = ( - (lf.validation or {}) - .get("content", {}) - .get("summary", {}) - .get("ips", {}) - ) - r["ip_local_count"] += ips.get("local", 0) or 0 - r["ip_remote_count"] += ips.get("remote", 0) or 0 - r["ip_unknown_count"] += ips.get("unknown", 0) or 0 - - w_count = _upsert_reports(WeeklyLogReport, weekly) - m_count = _upsert_reports(MonthlyLogReport, monthly) - y_count = _upsert_reports(YearlyLogReport, yearly) - - logging.info( - "Reports populated: %s weekly, %s monthly, %s yearly.", - w_count, m_count, y_count, + return log_report.populate_log_report_tables( + year=year, + collection_acron=collection_acron, ) - return f"Weekly: {w_count}, Monthly: {m_count}, Yearly: {y_count}" - - -def _upsert_reports(model_class, data): - count = 0 - unique_fields = list(model_class._meta.unique_together[0]) - period_fields = unique_fields[1:] - for key, fields in data.items(): - coll_id = key[0] - period_values = key[1:] - lookup = {"collection_id": coll_id} - for idx, field_name in enumerate(period_fields): - lookup[field_name] = period_values[idx] - model_class.objects.update_or_create(defaults=fields, **lookup) - count += 1 - return count @celery_app.task( @@ -132,107 +24,9 @@ def 
task_log_files_count_status_report( user_id=None, username=None, ): - from_date_str, until_date_str = date_utils.get_date_range_str( - from_date, until_date, days_to_go_back - ) - subject = _( - "Usage Log Report Summary " - f"({from_date_str} to {until_date_str})" + return emails.send_log_report_summary_emails( + collections=collections, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, ) - - for collection_acron in (collections or Collection.acron3_list()): - try: - collection = Collection.objects.get(acron3=collection_acron) - except Collection.DoesNotExist: - logging.warning("Collection not found: %s", collection_acron) - continue - - message = _build_report_message( - collection, - from_date_str, - until_date_str, - ) - - if not message: - continue - - logging.info( - "Sending email to collection %s. Subject: %s.", - collection.main_name, subject, - ) - - _send_collection_email(subject, message, collection_acron) - - -def _build_report_message(collection, from_date_str, until_date_str): - monthly = MonthlyLogReport.objects.filter( - collection=collection, - ).order_by("-year", "-month") - - if not monthly.exists(): - return "" - - latest = monthly.first() - message = _( - f"Usage Log Report for {collection.acron3}\n" - f"Period: {from_date_str} to {until_date_str}\n\n" - ) - message += _("Latest month ({latest}):\n").format(latest=latest.period_label) - message += ( - f" Total files: {latest.total_files}\n" - f" Validated files: {latest.validated_files} ({latest.pct_validated}%)\n" - f" Invalidated files: {latest.invalidated_files}\n" - f" Errored files: {latest.errored_files}\n" - f" Lines parsed: {latest.lines_parsed}\n" - f" Valid lines: {latest.valid_lines} ({latest.pct_valid_lines}%)\n" - f" Discarded lines: {latest.discarded_lines}\n" - f" Remote IPs: {latest.ip_remote_count} ({latest.pct_remote_ip}%)\n" - f" Local IPs: {latest.ip_local_count}\n" - ) - - prev_month = latest - if len(monthly) > 1: - prev_month = monthly[1] 
- message += _("\nPrevious month ({prev}):\n").format(prev=prev_month.period_label) - message += ( - f" Total files: {prev_month.total_files}\n" - f" Validated files: {prev_month.validated_files} ({prev_month.pct_validated}%)\n" - f" Valid lines: {prev_month.valid_lines} ({prev_month.pct_valid_lines}%)\n" - f" Remote IPs: {prev_month.ip_remote_count} ({prev_month.pct_remote_ip}%)\n" - ) - - if prev_month.total_files: - file_diff = latest.total_files - prev_month.total_files - line_diff = latest.lines_parsed - prev_month.lines_parsed - message += _("\nMonth-over-month change:\n") - message += f" Files: {file_diff:+d}\n" - message += f" Lines: {line_diff:+d}\n" - - message += ( - f"\n---\n" - f"This report is automatically generated by SciELO Usage.\n" - ) - return message - - -def _send_collection_email(subject, message, collection): - emails = lmc_models.CollectionEmail.objects.filter( - config__collection__acron3=collection, active=True - ).values_list("email", flat=True) - - if not emails: - logging.error( - "Error. 
Please, add an E-mail Configuration for the collection %s.", - collection, - ) - return - - try: - send_mail( - subject=subject, - message=message, - from_email=settings.DEFAULT_FROM_EMAIL, - recipient_list=list(emails), - ) - except Exception as e: - logging.error("Error sending log files report for %s: %s", collection, e) diff --git a/reports/tests/__init__.py b/reports/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reports/tests/test_services.py b/reports/tests/test_services.py new file mode 100644 index 0000000..34d25d4 --- /dev/null +++ b/reports/tests/test_services.py @@ -0,0 +1,70 @@ +from datetime import date + +from django.test import TestCase + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from reports.services import dates, log_report + + +class DateServiceTests(TestCase): + def test_get_report_date_from_log_file_uses_validation_probably_date(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + log_file = LogFile( + collection=collection, + path="/tmp/access.log", + stat_result={}, + hash="1" * 32, + status=choices.LOG_FILE_STATUS_CREATED, + validation={"probably_date": "2026-05-10"}, + ) + + self.assertEqual( + dates.get_report_date_from_log_file(log_file), + date(2026, 5, 10), + ) + + +class LogReportServiceTests(TestCase): + def test_populate_log_report_tables_aggregates_log_files(self): + from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport + + collection = Collection.objects.create(acron3="books", acron2="bk") + LogFile.objects.create( + collection=collection, + path="/tmp/access-1.log", + stat_result={}, + hash="1" * 32, + status=choices.LOG_FILE_STATUS_QUEUED, + date=date(2026, 5, 10), + summary={"lines_parsed": 10, "valid_lines": 7}, + validation={ + "content": { + "summary": { + "ips": {"local": 1, "remote": 2, "unknown": 3}, + } + } + }, + ) + + result = log_report.populate_log_report_tables( + 
year=2026, + collection_acron="books", + ) + + self.assertEqual(result, "Weekly: 1, Monthly: 1, Yearly: 1") + + weekly = WeeklyLogReport.objects.get(collection=collection) + monthly = MonthlyLogReport.objects.get(collection=collection) + yearly = YearlyLogReport.objects.get(collection=collection) + + for report in [weekly, monthly, yearly]: + self.assertEqual(report.total_files, 1) + self.assertEqual(report.validated_files, 1) + self.assertEqual(report.lines_parsed, 10) + self.assertEqual(report.valid_lines, 7) + self.assertEqual(report.discarded_lines, 3) + self.assertEqual(report.ip_local_count, 1) + self.assertEqual(report.ip_remote_count, 2) + self.assertEqual(report.ip_unknown_count, 3) diff --git a/reports/wagtail_hooks.py b/reports/wagtail_hooks.py index b2aeac7..5e9e76b 100644 --- a/reports/wagtail_hooks.py +++ b/reports/wagtail_hooks.py @@ -1,10 +1,10 @@ from django.contrib.auth import get_user_model from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup -from wagtail.snippets.models import register_snippet from wagtail.permission_policies.base import BasePermissionPolicy +from wagtail.snippets.models import register_snippet +from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup -from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport +from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport class ReadOnlyPermissionPolicy(BasePermissionPolicy): diff --git a/requirements/base.txt b/requirements/base.txt index 7b5ed61..5186a11 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,13 +63,13 @@ minio==7.2.7 reverse-geocode==1.6 # https://pypi.org/project/reverse-geocode/ # SciELO Log Validator --e git+https://github.com/scieloorg/scielo_log_validator@2.0.0#egg=scielo_log_validator +git+https://github.com/scieloorg/scielo_log_validator@2.0.0#egg=scielo_log_validator # SciELO Scholarly Data --e 
git+https://github.com/scieloorg/scielo_scholarly_data@v0.1.4#egg=scielo_scholarly_data +git+https://github.com/scieloorg/scielo_scholarly_data@v0.1.4#egg=scielo_scholarly_data # SciELO Usage COUNTER --e git+https://github.com/scieloorg/scielo_usage_counter@2.0.0#egg=scielo_usage_counter +git+https://github.com/scieloorg/scielo_usage_counter@2.0.0#egg=scielo_usage_counter # Device Detector device-detector==0.10 # https://github.com/thinkwelltwd/device_detector diff --git a/requirements/production.txt b/requirements/production.txt index 334e9f1..b580f8a 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -4,7 +4,7 @@ gevent==23.9.1 # http://www.gevent.org/ gunicorn==21.2.0 # https://github.com/benoitc/gunicorn -psycopg2-binary==2.9.9 # https://github.com/psycopg/psycopg2 +psycopg2==2.9.9 # https://github.com/psycopg/psycopg2 sentry-sdk==1.39.1 # https://github.com/getsentry/sentry-python # Django diff --git a/resources/admin.py b/resources/admin.py index 8c38f3f..846f6b4 100644 --- a/resources/admin.py +++ b/resources/admin.py @@ -1,3 +1 @@ -from django.contrib import admin - # Register your models here. 
diff --git a/resources/models.py b/resources/models.py index 22663e2..8021d6f 100644 --- a/resources/models.py +++ b/resources/models.py @@ -4,6 +4,7 @@ from django.utils.translation import gettext_lazy as _ from wagtail.admin.panels import FieldPanel + class RobotUserAgent(models.Model): SOURCE_ALL = "all" SOURCE_COUNTER = "counter" @@ -23,7 +24,7 @@ class RobotUserAgent(models.Model): updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) pattern = models.CharField( - verbose_name=_('Pattern'), + verbose_name=_("Pattern"), max_length=255, null=False, blank=False, @@ -51,7 +52,7 @@ class RobotUserAgent(models.Model): blank=True, ) last_changed = models.DateField( - verbose_name=_('Last Changed'), + verbose_name=_("Last Changed"), null=True, blank=True, ) @@ -101,20 +102,20 @@ class MMDB(models.Model): created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) id = models.CharField( - verbose_name=_('ID (HASH)'), - max_length=64, + verbose_name=_("ID (HASH)"), + max_length=64, primary_key=True, ) data = models.BinaryField( - verbose_name=_('MMDB Data'), + verbose_name=_("MMDB Data"), ) url = models.URLField( - verbose_name=_('URL'), + verbose_name=_("URL"), max_length=255, null=True, blank=True, ) - + def save(self, *args, **kwargs): if self.data: self.id = MMDB.compute_hash(self.data) @@ -125,4 +126,4 @@ def compute_hash(cls, data): return hashlib.sha256(data).hexdigest() def __str__(self): - return f'{self.id}' + return f"{self.id}" diff --git a/resources/services.py b/resources/services.py new file mode 100644 index 0000000..4f27b4f --- /dev/null +++ b/resources/services.py @@ -0,0 +1,119 @@ +import logging + +from django.conf import settings + +from resources import models, utils + + +def load_robots(url_robots=None): + if not url_robots: + url_robots = settings.COUNTER_ROBOTS_URL + logging.warning("No robots URL provided. 
Using default: %s", url_robots) + + try: + robots_data = utils.fetch_data(url_robots, data_type="json") + except Exception as e: + logging.error("Error downloading robots: %s", e) + return False + + cleaned_robots_data = utils.clean_robots_list(robots_data) + fetched_patterns = set() + + try: + for r_str in cleaned_robots_data: + pattern = r_str.get("pattern") + last_changed = r_str.get("last_changed") + fetched_patterns.add(pattern) + + r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first() + created = r_obj is None + + if created: + r_obj = models.RobotUserAgent( + pattern=pattern, + source_counter=True, + source_scielo=False, + ) + r_obj.source_counter = True + r_obj.is_active = True + r_obj.source_url = url_robots + r_obj.last_changed = last_changed + + r_obj.save() + logging.debug("Robot saved: %s", r_obj) + + stale_counter_patterns = models.RobotUserAgent.objects.filter( + source_counter=True + ).exclude(pattern__in=fetched_patterns) + + for r_obj in stale_counter_patterns: + r_obj.source_counter = False + r_obj.source_url = None + r_obj.last_changed = None + if not r_obj.source_scielo: + r_obj.is_active = False + r_obj.save() + logging.debug( + "Robot deactivated or detached from COUNTER source: %s", r_obj + ) + + return True + + except Exception as e: + logging.error("Error saving robots: %s", e) + return False + + +def load_geoip(url_geoip=None, validate=True): + if url_geoip: + candidates = [url_geoip] + else: + candidates = utils.resolve_mmdb_url() + logging.info("No GeoIP URL provided. Will try candidates: %s", candidates) + + data = None + resolved_url = None + for url in candidates: + try: + data = utils.fetch_data(url, data_type="content") + resolved_url = url + logging.info("GeoIP data downloaded from: %s", url) + break + except Exception as e: + logging.warning( + "Failed to download GeoIP from %s: %s. 
Trying next candidate.", url, e + ) + + if data is None: + logging.error( + "Could not download GeoIP data from any candidate URL: %s", candidates + ) + return False + + try: + mmdb_data = utils.decompress_gzip(data) + except Exception as e: + logging.error("Error decompressing GeoIP data: %s", e) + return False + + if validate: + try: + utils.validate_geoip_data(mmdb_data) + except Exception as e: + logging.error("Error validating GeoIP data: %s", e) + return False + + mmdb_hash = models.MMDB.compute_hash(mmdb_data) + + try: + mmdb_obj = models.MMDB.objects.get(id=mmdb_hash) + logging.debug("GeoIP data already exists: %s", mmdb_obj) + + except models.MMDB.DoesNotExist: + mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data) + mmdb_obj.url = resolved_url + + mmdb_obj.save() + logging.info("GeoIP data saved (url=%s, hash=%s)", resolved_url, mmdb_hash) + + return True diff --git a/resources/tasks.py b/resources/tasks.py index 0a87600..ea53c85 100644 --- a/resources/tasks.py +++ b/resources/tasks.py @@ -1,153 +1,12 @@ -import logging - -from django.conf import settings - from config import celery_app - -from . import models, utils +from resources import services -@celery_app.task(bind=True, name='[Resources] Load Robots Data') +@celery_app.task(bind=True, name="[Resources] Load Robots Data") def task_load_robots(self, url_robots=None): - """ - Load robots from a given URL and save them to the database. - This function fetches robot data from a specified URL (or a default URL if none is provided), - cleans the data, and saves it to the database. If the robots already exist in the database, - their information is updated. - Args: - url_robots (str, optional): The URL to fetch the robots data from. Defaults to None. - user_id (int, optional): The ID of the user performing the task. Defaults to None. - username (str, optional): The username of the user performing the task. Defaults to None. 
- Returns: - bool: True if the robots were successfully loaded and saved, False otherwise. - Raises: - Exception: If there is an error fetching or saving the robots data. - Logs: - - Warning if no robots URL is provided. - - Error if there is an issue downloading or saving the robots. - - Debug information for each robot saved. - """ - if not url_robots: - url_robots = settings.COUNTER_ROBOTS_URL - logging.warning(f'No robots URL provided. Using default: {url_robots}') - - try: - robots_data = utils.fetch_data(url_robots, data_type='json') - except Exception as e: - logging.error(f'Error downloading robots: {e}') - return False - - cleaned_robots_data = utils.clean_robots_list(robots_data) - fetched_patterns = set() - - try: - for r_str in cleaned_robots_data: - pattern = r_str.get('pattern') - last_changed = r_str.get('last_changed') - fetched_patterns.add(pattern) - - r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first() - created = r_obj is None - - if created: - r_obj = models.RobotUserAgent( - pattern=pattern, - source_counter=True, - source_scielo=False, - ) - r_obj.source_counter = True - r_obj.is_active = True - r_obj.source_url = url_robots - r_obj.last_changed = last_changed - - r_obj.save() - logging.debug(f'Robot saved: {r_obj}') - - stale_counter_patterns = models.RobotUserAgent.objects.filter( - source_counter=True - ).exclude(pattern__in=fetched_patterns) - - for r_obj in stale_counter_patterns: - r_obj.source_counter = False - r_obj.source_url = None - r_obj.last_changed = None - if not r_obj.source_scielo: - r_obj.is_active = False - r_obj.save() - logging.debug(f'Robot deactivated or detached from COUNTER source: {r_obj}') - - return True + return services.load_robots(url_robots=url_robots) - except Exception as e: - logging.error(f'Error saving robots: {e}') - return False - -@celery_app.task(bind=True, name='[Resources] Load Geolocation Data') +@celery_app.task(bind=True, name="[Resources] Load Geolocation Data") def 
task_load_geoip(self, url_geoip=None, validate=True): - """ - Load GeoIP data from a specified URL, validate it, and save it to the database. - - When ``url_geoip`` is not provided the task resolves the URL automatically: - it tries the current month first and, if the file is not yet available, - falls back to the previous month. - - Args: - url_geoip (str, optional): Explicit URL to download. Defaults to None - (auto-resolved for the current/previous month). - validate (bool, optional): Whether to validate the GeoIP data. Defaults to True. - Returns: - bool: True if the GeoIP data was successfully loaded and saved, False otherwise. - """ - if url_geoip: - candidates = [url_geoip] - else: - candidates = utils.resolve_mmdb_url() - logging.info('No GeoIP URL provided. Will try candidates: %s', candidates) - - data = None - resolved_url = None - for url in candidates: - try: - data = utils.fetch_data(url, data_type='content') - resolved_url = url - logging.info('GeoIP data downloaded from: %s', url) - break - except Exception as e: - logging.warning( - 'Failed to download GeoIP from %s: %s. 
Trying next candidate.', url, e - ) - - if data is None: - logging.error( - 'Could not download GeoIP data from any candidate URL: %s', candidates - ) - return False - - try: - mmdb_data = utils.decompress_gzip(data) - except Exception as e: - logging.error(f'Error decompressing GeoIP data: {e}') - return False - - if validate: - try: - utils.validate_geoip_data(mmdb_data) - except Exception as e: - logging.error(f'Error validating GeoIP data: {e}') - return False - - mmdb_hash = models.MMDB.compute_hash(mmdb_data) - - try: - mmdb_obj = models.MMDB.objects.get(id=mmdb_hash) - logging.debug(f'GeoIP data already exists: {mmdb_obj}') - - except models.MMDB.DoesNotExist: - mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data) - mmdb_obj.url = resolved_url - - mmdb_obj.save() - logging.info('GeoIP data saved (url=%s, hash=%s)', resolved_url, mmdb_hash) - - return True + return services.load_geoip(url_geoip=url_geoip, validate=validate) diff --git a/resources/tests/__init__.py b/resources/tests/__init__.py index 8b13789..e69de29 100644 --- a/resources/tests/__init__.py +++ b/resources/tests/__init__.py @@ -1 +0,0 @@ - diff --git a/resources/tests/test_mmdb.py b/resources/tests/test_mmdb.py new file mode 100644 index 0000000..8c73038 --- /dev/null +++ b/resources/tests/test_mmdb.py @@ -0,0 +1,22 @@ +from django.test import TestCase + +from resources.models import MMDB + + +class MMDBModelTests(TestCase): + def test_save_computes_sha256_hash_as_pk(self): + data = b"fake mmdb binary data" + mmdb = MMDB(data=data, url="https://example.org/GeoLite2-Country.mmdb") + mmdb.save() + + self.assertEqual(mmdb.pk, MMDB.compute_hash(data)) + self.assertEqual(MMDB.objects.count(), 1) + + def test_different_data_produces_different_hash(self): + mmdb1 = MMDB(data=b"data-v1") + mmdb1.save() + mmdb2 = MMDB(data=b"data-v2") + mmdb2.save() + + self.assertNotEqual(mmdb1.pk, mmdb2.pk) + self.assertEqual(MMDB.objects.count(), 2) diff --git a/resources/tests/test_robots.py 
b/resources/tests/test_robots.py index 330d4db..847832e 100644 --- a/resources/tests/test_robots.py +++ b/resources/tests/test_robots.py @@ -29,7 +29,9 @@ def test_get_all_patterns_only_returns_active_patterns(self): is_active=False, ) - self.assertListEqual(list(models.RobotUserAgent.get_all_patterns()), [active.pattern]) + self.assertListEqual( + list(models.RobotUserAgent.get_all_patterns()), [active.pattern] + ) def test_get_patterns_can_filter_by_source(self): counter_only = models.RobotUserAgent.objects.create( @@ -66,8 +68,7 @@ def test_get_patterns_rejects_invalid_source(self): class LoadRobotsTaskTests(TestCase): - - @patch("resources.tasks.utils.fetch_data") + @patch("resources.services.utils.fetch_data") @override_settings(COUNTER_ROBOTS_URL="https://settings.example.org/robots.json") def test_task_load_robots_uses_settings_url_when_not_provided( self, @@ -91,7 +92,7 @@ def test_task_load_robots_uses_settings_url_when_not_provided( "https://settings.example.org/robots.json", ) - @patch("resources.tasks.utils.fetch_data") + @patch("resources.services.utils.fetch_data") def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_entries( self, mock_fetch_data, @@ -124,7 +125,9 @@ def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_ent self.assertTrue(counter_bot.source_counter) self.assertFalse(counter_bot.source_scielo) self.assertTrue(counter_bot.is_active) - self.assertEqual(counter_bot.source_url, "https://counter.example.org/robots.json") + self.assertEqual( + counter_bot.source_url, "https://counter.example.org/robots.json" + ) shared_bot.refresh_from_db() self.assertTrue(shared_bot.source_counter) diff --git a/resources/utils.py b/resources/utils.py index c8d58fe..f47edfd 100644 --- a/resources/utils.py +++ b/resources/utils.py @@ -10,7 +10,7 @@ from django.conf import settings -def fetch_data(url, data_type='json', max_retries=5, sleep_time=30): +def fetch_data(url, data_type="json", max_retries=5, 
sleep_time=30): """ Retrieves data from the given URL. @@ -43,18 +43,14 @@ def fetch_data(url, data_type='json', max_retries=5, sleep_time=30): response.raise_for_status() except requests.exceptions.HTTPError: logging.warning( - 'Failed to retrieve data from %s. Waiting %d seconds before retry %d of %d' % ( - url, - sleep_time, - t, - max_retries - ) + "Failed to retrieve data from %s. Waiting %d seconds before retry %d of %d" + % (url, sleep_time, t, max_retries) ) sleep(sleep_time) else: - if data_type == 'json': + if data_type == "json": return response.json() - elif data_type == 'content': + elif data_type == "content": return response.content else: raise ValueError("Invalid data_type. Expected 'json' or 'content'.") @@ -76,7 +72,7 @@ def clean_robots_list(robots): """ cleaned_robots = [] for r in robots: - if r.get('pattern') and r.get('last_changed'): + if r.get("pattern") and r.get("last_changed"): cleaned_robots.append(r) return cleaned_robots @@ -86,7 +82,7 @@ def decompress_gzip(data): with gzip.GzipFile(fileobj=io.BytesIO(data)) as f: return f.read() except Exception as e: - raise Exception(f'Error decompressing data: {e}') + raise Exception(f"Error decompressing data: {e}") def validate_geoip_data(data): @@ -96,7 +92,7 @@ def validate_geoip_data(data): temp_file.flush() reader = geoip2.database.Reader(temp_file.name) except Exception as e: - raise Exception(f'Error validating GeoIP data: {e}') + raise Exception(f"Error validating GeoIP data: {e}") else: reader.close() return True diff --git a/resources/wagtail_hooks.py b/resources/wagtail_hooks.py index c347b22..808876a 100644 --- a/resources/wagtail_hooks.py +++ b/resources/wagtail_hooks.py @@ -1,10 +1,10 @@ from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from wagtail.snippets.models import register_snippet +from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from config.menu import 
get_menu_order -from .models import (RobotUserAgent, MMDB) +from resources.models import MMDB, RobotUserAgent class RobotUserAgentSnippetViewSet(SnippetViewSet): @@ -52,11 +52,14 @@ class MMDBSnippetViewSet(SnippetViewSet): class ResourcesSnippetViewSetGroup(SnippetViewSetGroup): - menu_name = 'resources' + menu_name = "resources" menu_label = _("Resources") menu_icon = "folder-open-inverse" menu_order = get_menu_order("resources") - items = (RobotUserAgentSnippetViewSet, MMDBSnippetViewSet,) + items = ( + RobotUserAgentSnippetViewSet, + MMDBSnippetViewSet, + ) register_snippet(ResourcesSnippetViewSetGroup) diff --git a/setup.cfg b/setup.cfg index c4ae862..7b8b505 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,7 @@ [flake8] max-line-length = 120 exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,venv +ignore = E203, W503 [pycodestyle] max-line-length = 120 diff --git a/source/__init__.py b/source/__init__.py index 8b13789..e69de29 100644 --- a/source/__init__.py +++ b/source/__init__.py @@ -1 +0,0 @@ - diff --git a/source/migrations/__init__.py b/source/migrations/__init__.py index 8b13789..e69de29 100644 --- a/source/migrations/__init__.py +++ b/source/migrations/__init__.py @@ -1 +0,0 @@ - diff --git a/source/models.py b/source/models.py index 48d3e00..c88e9f0 100644 --- a/source/models.py +++ b/source/models.py @@ -1,4 +1,5 @@ from django.db import models +from django.db.models import Q from django.utils.translation import gettext_lazy as _ from collection.models import Collection @@ -144,6 +145,46 @@ class Source(CommonControlField): def __str__(self): return f"{self.collection.acron3} - {self.source_type} - {self.source_id}" + @classmethod + def delete_book_source_by_id(cls, collection, book_id): + return cls.objects.filter( + collection=collection, + source_type=cls.SOURCE_TYPE_BOOK, + source_id=str(book_id), + ).delete() + + @classmethod + def find_journal_by_issns(cls, collection, issns): + for issn in filter(None, issns or []): + 
source = ( + cls.objects.filter( + collection=collection, + source_type=cls.SOURCE_TYPE_JOURNAL, + ) + .filter( + Q(scielo_issn=issn) + | Q(source_id=issn) + | Q(identifiers__electronic_issn=issn) + | Q(identifiers__print_issn=issn) + | Q(identifiers__scielo_issn=issn) + ) + .first() + ) + if source: + return source + return None + + @classmethod + def find_journal_by_acronym(cls, collection, acronym): + if not acronym: + return None + + return cls.objects.filter( + collection=collection, + source_type=cls.SOURCE_TYPE_JOURNAL, + acronym=acronym, + ).first() + @staticmethod def _extract_issns(identifiers): if not isinstance(identifiers, dict): diff --git a/source/services/__init__.py b/source/services/__init__.py index 8b13789..e69de29 100644 --- a/source/services/__init__.py +++ b/source/services/__init__.py @@ -1 +0,0 @@ - diff --git a/source/services/books.py b/source/services/book.py similarity index 73% rename from source/services/books.py rename to source/services/book.py index df9bd4d..6fc2016 100644 --- a/source/services/books.py +++ b/source/services/book.py @@ -1,14 +1,7 @@ -from collection.models import Collection +from core.utils.metadata import as_list, compact_dict, normalize_year from source.models import Source -BOOKS_COLLECTION_ACRONYM = "books" - - -def get_books_collection(acronym=BOOKS_COLLECTION_ACRONYM): - return Collection.objects.get(acron3=acronym) - - def upsert_monograph_source( payload, collection, @@ -34,12 +27,12 @@ def upsert_monograph_source( source.acronym = "" source.title = payload.get("title") or str(payload.get("id")) source.identifiers = _build_source_identifiers(payload) - source.publisher_name = _as_list(payload.get("publisher")) + source.publisher_name = as_list(payload.get("publisher")) source.subject_areas = [] source.wos_subject_areas = [] source.default_lang = payload.get("language") or None source.publication_date = payload.get("publication_date") or None - source.publication_year = _normalize_year(payload.get("year")) + 
source.publication_year = normalize_year(payload.get("year")) source.access_type = _normalize_access_type(payload.get("is_comercial")) source.extra_data = _build_source_extra_data( payload, @@ -54,14 +47,6 @@ def upsert_monograph_source( return source -def delete_book_source(collection, book_id): - return Source.objects.filter( - collection=collection, - source_type=Source.SOURCE_TYPE_BOOK, - source_id=str(book_id), - ).delete() - - def _build_source_identifiers(payload): identifiers = { "book_id": str(payload.get("id")) if payload.get("id") is not None else None, @@ -69,7 +54,7 @@ def _build_source_identifiers(payload): "eisbn": payload.get("eisbn"), "doi": payload.get("doi_number"), } - return _compact_dict(identifiers) + return compact_dict(identifiers) def _build_source_extra_data(payload, source_url=None, last_seq=None): @@ -96,23 +81,7 @@ def _build_source_extra_data(payload, source_url=None, last_seq=None): "primary_descriptor": payload.get("primary_descriptor"), "translated_primary_descriptors": payload.get("translated_primary_descriptors"), } - return _compact_dict(extra_data) - - -def _as_list(value): - if not value: - return [] - - if isinstance(value, list): - return value - - return [value] - - -def _normalize_year(value): - if value in (None, ""): - return None - return str(value)[:4] + return compact_dict(extra_data) def _normalize_access_type(value): @@ -126,12 +95,6 @@ def _normalize_access_type(value): if normalized in {"false", "0", "no", "n", "nao", "não"}: return Source.ACCESS_TYPE_OPEN_ACCESS - return Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS - - -def _compact_dict(data): - return { - key: value - for key, value in data.items() - if value not in (None, "", [], {}, ()) - } + return ( + Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS + ) diff --git a/source/services/journal.py b/source/services/journal.py new file mode 100644 index 0000000..273c912 --- /dev/null +++ 
b/source/services/journal.py @@ -0,0 +1,56 @@ +from core.utils.metadata import as_list, compact_dict, get_value +from source.models import Source + + +def upsert_journal_source( + journal, + collection, + user=None, + force_update=True, + load_mode=None, +): + scielo_issn = get_value(journal, "scielo_issn") + if not scielo_issn: + return None + + source, created = Source.objects.get_or_create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id=scielo_issn, + ) + + if created and user: + source.creator = user + + if created or force_update: + source.scielo_issn = scielo_issn + source.acronym = get_value(journal, "acronym") or "" + source.title = get_value(journal, "title") or scielo_issn + source.identifiers = _build_source_identifiers(journal) + source.publisher_name = as_list(get_value(journal, "publisher_name")) + source.subject_areas = as_list(get_value(journal, "subject_areas")) + source.wos_subject_areas = as_list(get_value(journal, "wos_subject_areas")) + source.default_lang = None + source.publication_date = None + source.publication_year = None + source.extra_data = compact_dict( + { + "collection_acronym": get_value(journal, "collection_acronym"), + "load_mode": load_mode, + } + ) + + if user: + source.updated_by = user + + source.save() + return source + + +def _build_source_identifiers(journal): + identifiers = { + "electronic_issn": get_value(journal, "electronic_issn"), + "print_issn": get_value(journal, "print_issn"), + "scielo_issn": get_value(journal, "scielo_issn"), + } + return compact_dict(identifiers) diff --git a/source/services/journals.py b/source/services/journals.py deleted file mode 100644 index ac133f6..0000000 --- a/source/services/journals.py +++ /dev/null @@ -1,118 +0,0 @@ -from django.db.models import Q - -from collection.models import Collection -from source.models import Source - - -def get_collection(acronym): - return Collection.objects.filter(acron3=acronym).first() - - -def upsert_journal_source( - 
journal, - collection, - user=None, - force_update=True, - load_mode=None, -): - scielo_issn = _value(journal, "scielo_issn") - if not scielo_issn: - return None - - source, created = Source.objects.get_or_create( - collection=collection, - source_type=Source.SOURCE_TYPE_JOURNAL, - source_id=scielo_issn, - ) - - if created and user: - source.creator = user - - if created or force_update: - source.scielo_issn = scielo_issn - source.acronym = _value(journal, "acronym") or "" - source.title = _value(journal, "title") or scielo_issn - source.identifiers = _build_source_identifiers(journal) - source.publisher_name = _as_list(_value(journal, "publisher_name")) - source.subject_areas = _as_list(_value(journal, "subject_areas")) - source.wos_subject_areas = _as_list(_value(journal, "wos_subject_areas")) - source.default_lang = None - source.publication_date = None - source.publication_year = None - source.extra_data = _compact_dict( - { - "collection_acronym": _value(journal, "collection_acronym"), - "load_mode": load_mode, - } - ) - - if user: - source.updated_by = user - - source.save() - return source - - -def find_journal_source_by_issns(collection, issns): - for issn in filter(None, issns or []): - source = ( - Source.objects.filter( - collection=collection, - source_type=Source.SOURCE_TYPE_JOURNAL, - ) - .filter( - Q(scielo_issn=issn) - | Q(source_id=issn) - | Q(identifiers__electronic_issn=issn) - | Q(identifiers__print_issn=issn) - | Q(identifiers__scielo_issn=issn) - ) - .first() - ) - if source: - return source - return None - - -def find_journal_source_by_acronym(collection, acronym): - if not acronym: - return None - - return Source.objects.filter( - collection=collection, - source_type=Source.SOURCE_TYPE_JOURNAL, - acronym=acronym, - ).first() - - -def _build_source_identifiers(journal): - identifiers = { - "electronic_issn": _value(journal, "electronic_issn"), - "print_issn": _value(journal, "print_issn"), - "scielo_issn": _value(journal, "scielo_issn"), - } 
- return _compact_dict(identifiers) - - -def _as_list(value): - if not value: - return [] - - if isinstance(value, list): - return value - - return [value] - - -def _value(data, key, default=None): - if isinstance(data, dict): - return data.get(key, default) - return getattr(data, key, default) - - -def _compact_dict(data): - return { - key: value - for key, value in data.items() - if value not in (None, "", [], {}, ()) - } diff --git a/source/services/loaders.py b/source/services/loaders.py new file mode 100644 index 0000000..4c99238 --- /dev/null +++ b/source/services/loaders.py @@ -0,0 +1,104 @@ +import logging + +from django.conf import settings + +from collection.models import Collection +from core.collectors import articlemeta as articlemeta_collector +from core.collectors import scielo_books as scielo_books_collector +from source.models import Source +from source.services import book as books_service +from source.services import journal as journal_service + + +def load_sources_from_article_meta( + collections=None, + force_update=True, + user=None, + mode="thrift", +): + collection_codes = collections or Collection.acron3_list() + + for collection_code in collection_codes: + logging.info( + "Loading sources from Article Meta. 
Collection: %s, Mode: %s", + collection_code, + mode, + ) + + for journal in articlemeta_collector.iter_journals( + collection=collection_code, + mode=mode, + ): + collection = Collection.objects.filter( + acron3=journal.collection_acronym + ).first() + if not collection: + logging.error( + "Collection %s does not exist", + journal.collection_acronym, + ) + continue + + source = journal_service.upsert_journal_source( + journal, + collection=collection, + user=user, + force_update=force_update, + load_mode=mode, + ) + logging.info( + "Source %s upserted for collection %s", + source.source_id if source else None, + collection.acron3, + ) + + return True + + +def load_sources_from_scielo_books( + collection="books", + db_name=settings.SCIELO_BOOKS_DB_NAME, + since=0, + limit=settings.SCIELO_BOOKS_LIMIT, + force_update=True, + headers=None, + base_url=None, + user=None, +): + collection_obj = Collection.objects.get(acron3=collection) + + logging.info( + "Loading sources from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s", + collection, + db_name, + since, + limit, + ) + + for item in scielo_books_collector.iter_change_documents( + base_url=base_url, + db_name=db_name, + since=since, + limit=limit, + headers=headers, + ): + change = item["change"] + + if item["deleted"]: + Source.delete_book_source_by_id(collection_obj, change.get("id")) + continue + + payload = item["payload"] or {} + if payload.get("TYPE") != "Monograph": + continue + + books_service.upsert_monograph_source( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=item.get("source_url"), + last_seq=change.get("seq"), + ) + + return True diff --git a/source/tasks.py b/source/tasks.py index 6b7eeb2..9ff1c74 100644 --- a/source/tasks.py +++ b/source/tasks.py @@ -1,106 +1,8 @@ -import logging - from django.conf import settings -from collection.models import Collection from config import celery_app -from core.collectors import articlemeta as 
articlemeta_collector -from core.collectors import scielo_books as scielo_books_collector from core.utils.request_utils import _get_user -from source.services import books as books_service -from source.services import journals as journal_service - - -def load_sources_from_article_meta( - collections=None, - force_update=True, - user=None, - mode="thrift", -): - collection_codes = collections or Collection.acron3_list() - - for collection_code in collection_codes: - logging.info( - "Loading sources from Article Meta. Collection: %s, Mode: %s", - collection_code, - mode, - ) - - for journal in articlemeta_collector.iter_journals( - collection=collection_code, - mode=mode, - ): - collection = journal_service.get_collection(journal.collection_acronym) - if not collection: - logging.error( - "Collection %s does not exist", - journal.collection_acronym, - ) - continue - - source = journal_service.upsert_journal_source( - journal, - collection=collection, - user=user, - force_update=force_update, - load_mode=mode, - ) - logging.info( - "Source %s upserted for collection %s", - source.source_id if source else None, - collection.acron3, - ) - - return True - - -def load_sources_from_scielo_books( - collection="books", - db_name=settings.SCIELO_BOOKS_DB_NAME, - since=0, - limit=settings.SCIELO_BOOKS_LIMIT, - force_update=True, - headers=None, - base_url=None, - user=None, -): - collection_obj = books_service.get_books_collection(collection) - - logging.info( - "Loading sources from SciELO Books. 
Collection: %s, DB: %s, Since: %s, Limit: %s", - collection, - db_name, - since, - limit, - ) - - for item in scielo_books_collector.iter_change_documents( - base_url=base_url, - db_name=db_name, - since=since, - limit=limit, - headers=headers, - ): - change = item["change"] - - if item["deleted"]: - books_service.delete_book_source(collection_obj, change.get("id")) - continue - - payload = item["payload"] or {} - if payload.get("TYPE") != "Monograph": - continue - - books_service.upsert_monograph_source( - payload, - collection=collection_obj, - user=user, - force_update=force_update, - source_url=item.get("source_url"), - last_seq=change.get("seq"), - ) - - return True +from source.services import loaders @celery_app.task(bind=True, name="[Metadata] Sync Sources (Article Meta)", queue="load") @@ -113,7 +15,7 @@ def task_load_sources_from_article_meta( mode="thrift", ): user = _get_user(self.request, username=username, user_id=user_id) - return load_sources_from_article_meta( + return loaders.load_sources_from_article_meta( collections=collections, force_update=force_update, user=user, @@ -135,7 +37,7 @@ def task_load_sources_from_scielo_books( username=None, ): user = _get_user(self.request, username=username, user_id=user_id) - return load_sources_from_scielo_books( + return loaders.load_sources_from_scielo_books( collection=collection, db_name=db_name, since=since, diff --git a/source/tests/__init__.py b/source/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/source/tests/test_models.py b/source/tests/test_models.py new file mode 100644 index 0000000..d1206a9 --- /dev/null +++ b/source/tests/test_models.py @@ -0,0 +1,66 @@ +from django.test import TestCase + +from collection.models import Collection +from source.models import Source + + +class SourceLookupTests(TestCase): + def test_find_journal_by_issns_searches_source_and_identifier_fields(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = 
Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + acronym="testjou", + title="Test Journal", + identifiers={ + "electronic_issn": "2345-6789", + "print_issn": "8765-4321", + "scielo_issn": "3456-7890", + }, + ) + + for issn in ("1234-5678", "2345-6789", "8765-4321", "3456-7890"): + self.assertEqual( + Source.find_journal_by_issns(collection, [issn]), + source, + ) + + self.assertIsNone(Source.find_journal_by_issns(collection, ["0000-0000"])) + + def test_find_journal_by_acronym(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + acronym="testjou", + title="Test Journal", + ) + + self.assertEqual(Source.find_journal_by_acronym(collection, "testjou"), source) + self.assertIsNone(Source.find_journal_by_acronym(collection, "missing")) + self.assertIsNone(Source.find_journal_by_acronym(collection, "")) + + def test_delete_book_source_by_id(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + other_collection = Collection.objects.create(acron3="other", acron2="ot") + Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="abcd1", + title="Book", + ) + Source.objects.create( + collection=other_collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="abcd1", + title="Book", + ) + + deleted_count, _ = Source.delete_book_source_by_id(collection, "abcd1") + + self.assertEqual(deleted_count, 1) + self.assertFalse(Source.objects.filter(collection=collection).exists()) + self.assertTrue(Source.objects.filter(collection=other_collection).exists()) diff --git a/source/tests.py b/source/tests/test_services.py similarity index 68% rename from source/tests.py rename to source/tests/test_services.py index a182f4e..4b72a08 100644 --- a/source/tests.py +++ 
b/source/tests/test_services.py @@ -1,23 +1,12 @@ from django.test import TestCase from collection.models import Collection - -from .models import Source -from .services import books as books_service -from .services import journals as journal_service +from source.models import Source +from source.services import book as books_service +from source.services import journal as journal_service class SourceMetadataTests(TestCase): - def test_source_type_choices_include_scielo_non_journal_sources(self): - self.assertIn( - (Source.SOURCE_TYPE_PREPRINT_SERVER, "Preprint Server"), - [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], - ) - self.assertIn( - (Source.SOURCE_TYPE_DATA_REPOSITORY, "Data Repository"), - [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], - ) - def test_metadata_exposes_generic_and_journal_fields(self): collection = Collection.objects.create(acron3="scl", acron2="sc") Source.objects.create( @@ -50,6 +39,8 @@ def test_metadata_exposes_generic_and_journal_fields(self): self.assertEqual(metadata[0]["issns"], {"1234-5678", "8765-4321"}) self.assertEqual(metadata[0]["title"], "Test Journal") + +class BookSourceServiceTests(TestCase): def test_upsert_monograph_source_maps_scielo_books_payload(self): collection = Collection.objects.create(acron3="books", acron2="bk") @@ -78,27 +69,8 @@ def test_upsert_monograph_source_maps_scielo_books_payload(self): self.assertEqual(source.publication_year, "2024") self.assertEqual(source.access_type, Source.ACCESS_TYPE_OPEN_ACCESS) - def test_upsert_monograph_source_accepts_long_real_world_title(self): - collection = Collection.objects.create(acron3="books", acron2="bk") - title = ( - "O Estado da Arte sobre Refugiados, Deslocados Internos, " - "Deslocados Ambientais e Apatridas no Brasil: atualizacao do " - "Diretorio Nacional do ACNUR de teses, dissertacoes, trabalhos " - "de conclusao de curso de graduacao em Joao Pessoa (Paraiba) e " - "artigos (2007 a 2017)" - ) - - source = 
books_service.upsert_monograph_source( - { - "TYPE": "Monograph", - "id": "9zzts", - "title": title, - }, - collection=collection, - ) - - self.assertEqual(source.title, title) +class JournalSourceServiceTests(TestCase): def test_upsert_journal_source_maps_articlemeta_payload(self): collection = Collection.objects.create(acron3="scl", acron2="sc") @@ -123,11 +95,3 @@ def test_upsert_journal_source_maps_articlemeta_payload(self): self.assertEqual(source.identifiers["electronic_issn"], "1234-5678") self.assertEqual(source.publisher_name, ["SciELO"]) self.assertEqual(source.extra_data["load_mode"], "thrift") - self.assertEqual( - journal_service.find_journal_source_by_issns(collection, ["8765-4321"]).pk, - source.pk, - ) - self.assertEqual( - journal_service.find_journal_source_by_acronym(collection, "testjou").pk, - source.pk, - ) diff --git a/source/wagtail_hooks.py b/source/wagtail_hooks.py index 5ffad62..4d62334 100644 --- a/source/wagtail_hooks.py +++ b/source/wagtail_hooks.py @@ -1,7 +1,7 @@ from django.utils.translation import gettext_lazy as _ from wagtail.snippets.views.snippets import SnippetViewSet -from .models import Source +from source.models import Source class SourceSnippetViewSet(SnippetViewSet): diff --git a/tracker/choices.py b/tracker/choices.py index dfc562c..81abcb5 100644 --- a/tracker/choices.py +++ b/tracker/choices.py @@ -1,11 +1,10 @@ from django.utils.translation import gettext_lazy as _ - -LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = 'MET' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = 'DOC' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = 'SRC' -LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = 'URL' -LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = 'DBE' +LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = "MET" +LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = "DOC" +LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = "SRC" +LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = "URL" +LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = "DBE" 
LOG_FILE_DISCARDED_LINE_REASON = [ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA, _("Missing Metadata")), diff --git a/tracker/models.py b/tracker/models.py index a394ed6..0654c31 100644 --- a/tracker/models.py +++ b/tracker/models.py @@ -3,15 +3,15 @@ from log_manager.models import LogFile from tracker import choices -from .exceptions import LogFileDiscardedLineCreateError +from tracker.exceptions import LogFileDiscardedLineCreateError class LogFileDiscardedLine(models.Model): created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) log_file = models.ForeignKey( - LogFile, - on_delete=models.CASCADE, - null=False, + LogFile, + on_delete=models.CASCADE, + null=False, blank=False, db_index=True, ) @@ -31,10 +31,7 @@ class LogFileDiscardedLine(models.Model): null=True, blank=True, ) - handled = models.BooleanField( - _("Handled"), - default=False - ) + handled = models.BooleanField(_("Handled"), default=False) @classmethod def create(cls, log_file, error_type, data, message, save=False): @@ -54,6 +51,3 @@ def create(cls, log_file, error_type, data, message, save=False): def __str__(self): return f"{self.data} - {self.message}" - - - diff --git a/tracker/wagtail_hooks.py b/tracker/wagtail_hooks.py index 1ceb9c7..f62f63f 100644 --- a/tracker/wagtail_hooks.py +++ b/tracker/wagtail_hooks.py @@ -1,16 +1,16 @@ from django.utils.translation import gettext as _ -from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from wagtail.snippets.models import register_snippet +from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup from config.menu import get_menu_order -from .models import LogFileDiscardedLine +from tracker.models import LogFileDiscardedLine class LogFileDiscardedLineSnippetViewSet(SnippetViewSet): model = LogFileDiscardedLine menu_label = _("Discarded Lines") - icon = 'warning' + icon = "warning" menu_order = get_menu_order("tracker") add_to_admin_menu = False @@ -20,12 +20,7 @@ class 
LogFileDiscardedLineSnippetViewSet(SnippetViewSet): "message", "handled", ) - list_filter = ( - "log_file__collection", - "log_file", - "handled", - "error_type" - ) + list_filter = ("log_file__collection", "log_file", "handled", "error_type") search_fields = ( "data", "message", @@ -39,17 +34,13 @@ class LogFileDiscardedLineSnippetViewSet(SnippetViewSet): ) - - class TrackerSnippetViewSetGroup(SnippetViewSetGroup): - menu_name = 'tracker' + menu_name = "tracker" menu_label = _("Tracker") icon = "folder-open-inverse" menu_order = get_menu_order("tracker") - - items = ( - LogFileDiscardedLineSnippetViewSet, - ) + + items = (LogFileDiscardedLineSnippetViewSet,) register_snippet(TrackerSnippetViewSetGroup)