Initial commit: add .gitignore and README
This commit is contained in:
112
data-storage/data-catalog/README.md
Normal file
112
data-storage/data-catalog/README.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Data Catalog
|
||||
|
||||
**Purpose**: Unified data catalog for tracking and discovering datasets
|
||||
**Status**: 🚧 Planned
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The data catalog provides a centralized registry for all datasets across the workspace, enabling discovery, access control, and metadata management.
|
||||
|
||||
---
|
||||
|
||||
## Features
|
||||
|
||||
- Dataset registration
|
||||
- Metadata management
|
||||
- Search and discovery
|
||||
- Access control
|
||||
- Schema tracking
|
||||
- Lineage tracking
|
||||
|
||||
---
|
||||
|
||||
## Schema
|
||||
|
||||
See `metadata-schema.json` for the complete metadata schema. The fields `id`, `name`, `source`, and `storage` are required; all others are optional.
|
||||
|
||||
### Key Fields
|
||||
|
||||
- **id**: Unique dataset identifier
|
||||
- **name**: Human-readable name
|
||||
- **source**: Source system/project
|
||||
- **storage**: Storage location details
|
||||
- **schema**: Data schema definition
|
||||
- **tags**: Categorization tags
|
||||
- **access**: Access control settings
|
||||
|
||||
---
|
||||
|
||||
## Implementation Options
|
||||
|
||||
### Option 1: Custom API
|
||||
- Build custom API using shared packages
|
||||
- Use PostgreSQL for metadata storage
|
||||
- Implement search using PostgreSQL full-text search
|
||||
|
||||
### Option 2: DataHub
|
||||
- Deploy DataHub (open-source)
|
||||
- Use existing metadata models
|
||||
- Leverage built-in features
|
||||
|
||||
### Option 3: Amundsen
|
||||
- Deploy Amundsen (open-source)
|
||||
- Use existing metadata models
|
||||
- Leverage built-in features
|
||||
|
||||
---
|
||||
|
||||
## Usage
|
||||
|
||||
### Register Dataset
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "user-events-2025",
|
||||
"name": "User Events 2025",
|
||||
"description": "User interaction events for 2025",
|
||||
"source": "analytics-service",
|
||||
"storage": {
|
||||
"type": "minio",
|
||||
"bucket": "analytics",
|
||||
"path": "events/2025/"
|
||||
},
|
||||
"format": "parquet",
|
||||
"tags": ["events", "analytics", "2025"],
|
||||
"owner": "analytics-team",
|
||||
"access": {
|
||||
"level": "internal",
|
||||
"permissions": ["read"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Search Datasets
|
||||
|
||||
```text
|
||||
# Search by tag
|
||||
GET /api/catalog/datasets?tag=analytics
|
||||
|
||||
# Search by source
|
||||
GET /api/catalog/datasets?source=analytics-service
|
||||
|
||||
# Full-text search
|
||||
GET /api/catalog/datasets?q=user+events
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Choose implementation option
|
||||
2. Set up metadata storage
|
||||
3. Implement registration API
|
||||
4. Implement search functionality
|
||||
5. Set up access control
|
||||
6. Integrate with projects
|
||||
|
||||
---
|
||||
|
||||
**Status**: 🚧 Planned - Metadata schema drafted; implementation option not yet chosen, implementation pending
|
||||
|
||||
92
data-storage/data-catalog/metadata-schema.json
Normal file
92
data-storage/data-catalog/metadata-schema.json
Normal file
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"title": "Data Catalog Metadata Schema",
|
||||
"description": "Schema for data catalog metadata",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier for the dataset"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Human-readable name of the dataset"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Description of the dataset"
|
||||
},
|
||||
"source": {
|
||||
"type": "string",
|
||||
"description": "Source system or project"
|
||||
},
|
||||
"storage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["s3", "minio", "azure-blob", "gcs"],
|
||||
"description": "Storage type"
|
||||
},
|
||||
"bucket": {
|
||||
"type": "string",
|
||||
"description": "Bucket or container name"
|
||||
},
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path within bucket"
|
||||
}
|
||||
},
|
||||
"required": ["type", "bucket"]
|
||||
},
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"description": "Data schema definition"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["parquet", "json", "csv", "avro"],
|
||||
"description": "Data format"
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags for categorization"
|
||||
},
|
||||
"owner": {
|
||||
"type": "string",
|
||||
"description": "Owner or team responsible"
|
||||
},
|
||||
"created": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Creation timestamp"
|
||||
},
|
||||
"updated": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Last update timestamp"
|
||||
},
|
||||
"access": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"level": {
|
||||
"type": "string",
|
||||
"enum": ["public", "internal", "restricted"],
|
||||
"description": "Access level"
|
||||
},
|
||||
"permissions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["read", "write", "delete"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["id", "name", "source", "storage"]
|
||||
}
|
||||
|
||||
100
data-storage/minio/k8s-deployment.yaml
Normal file
100
data-storage/minio/k8s-deployment.yaml
Normal file
@@ -0,0 +1,100 @@
|
||||
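# MinIO object storage on Kubernetes: Namespace, Secret, StatefulSet, headless Service, and console LoadBalancer Service.
# NOTE(review): the StatefulSet requests 4 replicas, but `server /data` starts each pod as an independent single-node MinIO server, so the pods do not share data. Confirm whether distributed mode (listing all pod endpoints in the server args) or a single replica is intended.
# NOTE(review): replace the placeholder root credentials in the Secret and pin the image tag before any non-local deployment.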
|
||||
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: data-storage
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: minio-secret
|
||||
namespace: data-storage
|
||||
type: Opaque
|
||||
stringData:
|
||||
MINIO_ROOT_USER: minioadmin
|
||||
MINIO_ROOT_PASSWORD: change-me-in-production
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: minio
|
||||
namespace: data-storage
|
||||
spec:
|
||||
serviceName: minio
|
||||
replicas: 4
|
||||
selector:
|
||||
matchLabels:
|
||||
app: minio
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: minio
|
||||
spec:
|
||||
containers:
|
||||
- name: minio
|
||||
image: minio/minio:latest
|
||||
args:
|
||||
- server
|
||||
- /data
|
||||
- --console-address
|
||||
- ":9001"
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: minio-secret
|
||||
ports:
|
||||
- containerPort: 9000
|
||||
name: api
|
||||
- containerPort: 9001
|
||||
name: console
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
storageClassName: standard
|
||||
resources:
|
||||
requests:
|
||||
storage: 100Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: minio
|
||||
namespace: data-storage
|
||||
spec:
|
||||
clusterIP: None
|
||||
ports:
|
||||
- port: 9000
|
||||
targetPort: 9000
|
||||
name: api
|
||||
- port: 9001
|
||||
targetPort: 9001
|
||||
name: console
|
||||
selector:
|
||||
app: minio
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: minio-console
|
||||
namespace: data-storage
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
ports:
|
||||
- port: 9001
|
||||
targetPort: 9001
|
||||
name: console
|
||||
selector:
|
||||
app: minio
|
||||
|
||||
Reference in New Issue
Block a user