Test Preparation

This commit is contained in:
Koncept Kit
2025-12-26 20:03:53 +07:00
parent fad23c6e57
commit 487481b322
10 changed files with 1357 additions and 9 deletions

531
wordpress_parser.py Normal file
View File

@@ -0,0 +1,531 @@
"""
WordPress CSV Parser Module
This module provides utilities for parsing WordPress user export CSV files
and transforming them into LOAF platform-compatible data structures.
Key Features:
- Parse PHP serialized data (WordPress capabilities)
- Map WordPress roles to LOAF roles and statuses
- Validate and standardize user data (DOB, phone numbers)
- Generate smart status suggestions based on approval and subscription data
- Comprehensive data quality analysis and error reporting
Author: Claude Code
Date: 2025-12-24
"""
import csv
import re
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import phpserialize
logger = logging.getLogger(__name__)
# ============================================================================
# WordPress Role Mapping Configuration
# ============================================================================
# Lookup table: WordPress role name -> (LOAF role, fixed status).
# A status of None means the role does not fix the status; analyze_csv()
# then derives it from the approval/subscription columns via suggest_status().
ROLE_MAPPING = {
    # Staff roles map to LOAF staff roles and are always 'active'.
    'administrator': ('superadmin', 'active'),
    'loaf_admin': ('admin', 'active'),
    'loaf_treasure': ('finance', 'active'),
    'loaf_communication': ('admin', 'active'),

    # Subscription plan holders become members; status comes from approval.
    'pms_subscription_plan_63': ('member', None),

    # 'registered' is the default WordPress role and maps to guest.
    'registered': ('guest', None),

    # Sentinel entry used when no role (or only unknown roles) is present.
    '__default__': ('guest', None),
}
# WordPress roles ordered from lowest to highest precedence: when a user
# holds several roles, the one appearing latest in this list is the one
# map_wordpress_role() maps.  Roles absent from this list rank below all
# of the entries here.
ROLE_PRIORITY = [
    'registered',                # default WordPress role
    'pms_subscription_plan_63',  # subscription plan
    'loaf_communication',
    'loaf_treasure',
    'loaf_admin',
    'administrator',             # highest precedence
]
# ============================================================================
# PHP Serialization Parsing
# ============================================================================
def parse_php_serialized(data: str) -> List[str]:
    """
    Parse a WordPress PHP-serialized capabilities string into role names.

    WordPress stores user capabilities as serialized PHP arrays like:
        a:1:{s:10:"registered";b:1;}
        a:2:{s:10:"registered";b:1;s:24:"pms_subscription_plan_63";b:1;}

    Parsing is best-effort: any failure is logged as a warning and an empty
    list is returned, so one bad cell never aborts a whole import.

    Args:
        data: PHP serialized string. Empty, None and NaN values yield [].
            Non-string values are converted with str() before parsing.

    Returns:
        List of role names whose serialized value is truthy
        (e.g., ['registered', 'pms_subscription_plan_63']).

    Examples:
        >>> parse_php_serialized('a:1:{s:10:"registered";b:1;}')
        ['registered']
        >>> parse_php_serialized('a:2:{s:10:"registered";b:1;s:24:"pms_subscription_plan_63";b:1;}')
        ['registered', 'pms_subscription_plan_63']
    """
    if not data or pd.isna(data):
        return []

    # Normalise to text up front so that both the parser input and the log
    # preview below are well-defined even when a non-string cell slips in
    # (slicing a number inside the except handler would raise on its own).
    if isinstance(data, bytes):
        text = data.decode('utf-8', errors='replace')
        payload = data
    else:
        text = str(data)
        payload = text.encode('utf-8')

    try:
        parsed = phpserialize.loads(payload)
        if not isinstance(parsed, dict):
            return []
        # phpserialize yields bytes keys for PHP strings; decode them and
        # stringify anything else so the result is always a list of str.
        return [
            key.decode('utf-8') if isinstance(key, bytes) else str(key)
            for key, value in parsed.items()
            if value
        ]
    except Exception as exc:
        # Deliberately broad: malformed capability data must not abort the
        # import, it is only reported.
        logger.warning(
            "Failed to parse PHP serialized data: %s... Error: %s",
            text[:50],
            exc,
        )
        return []
# ============================================================================
# Role and Status Mapping
# ============================================================================
def map_wordpress_role(wp_roles: List[str]) -> Tuple[str, Optional[str]]:
    """
    Translate a user's WordPress roles into a LOAF role and fixed status.

    The role with the highest position in ROLE_PRIORITY wins and is looked
    up in ROLE_MAPPING. Roles not listed in ROLE_PRIORITY rank below every
    listed role; if the winning role has no ROLE_MAPPING entry, the
    '__default__' entry is used.

    Args:
        wp_roles: List of WordPress role names.

    Returns:
        Tuple of (loaf_role, suggested_status) taken from ROLE_MAPPING.
        suggested_status is None when the role does not fix a status.

    Examples:
        >>> map_wordpress_role(['loaf_admin'])
        ('admin', 'active')
        >>> map_wordpress_role(['loaf_treasure'])
        ('finance', 'active')
        >>> map_wordpress_role(['pms_subscription_plan_63', 'registered'])
        ('member', None)
        >>> map_wordpress_role(['registered'])
        ('guest', None)
    """
    fallback = ROLE_MAPPING['__default__']
    if not wp_roles:
        return fallback

    def rank_of(role_name):
        # Unlisted roles get -1 so that any listed role outranks them.
        try:
            return ROLE_PRIORITY.index(role_name)
        except ValueError:
            return -1

    winner = None
    winner_rank = None
    for candidate in wp_roles:
        candidate_rank = rank_of(candidate)
        # ">=" keeps the later of two equally ranked roles.
        if winner_rank is None or candidate_rank >= winner_rank:
            winner = candidate
            winner_rank = candidate_rank

    return ROLE_MAPPING.get(winner, fallback)
def suggest_status(approval_status: str, has_subscription: bool, wordpress_role: str = 'guest') -> str:
    """
    Suggest a LOAF user status from WordPress approval and subscription data.

    Logic:
        1. LOAF staff roles (superadmin, admin, finance) → always 'active'
        2. approved + subscription → 'active'
        3. approved without subscription → 'pre_validated'
        4. pending → 'payment_pending'
        5. unapproved → 'inactive'
        6. Other/empty → 'pre_validated'

    Args:
        approval_status: WordPress approval status (approved, pending,
            unapproved, ...). Matched case-insensitively after trimming.
            None, empty and non-string values (such as NaN) are accepted
            and treated as an unknown status.
        has_subscription: Whether the user has the subscription role.
        wordpress_role: Despite the name, this is the LOAF role already
            mapped from WordPress; it is compared against the LOAF staff
            roles for the admin shortcut.

    Returns:
        Suggested LOAF status: active, pre_validated, payment_pending,
        or inactive.

    Examples:
        >>> suggest_status('approved', True, 'member')
        'active'
        >>> suggest_status('approved', False, 'member')
        'pre_validated'
        >>> suggest_status('pending', True, 'member')
        'payment_pending'
        >>> suggest_status('unapproved', True, 'member')
        'inactive'
        >>> suggest_status('', False, 'admin')
        'active'
    """
    # Staff accounts are active regardless of approval data.
    if wordpress_role in ('superadmin', 'admin', 'finance'):
        return 'active'

    # Coerce through str() so a non-string cell (e.g. float NaN from pandas)
    # falls into the "unknown" branch instead of raising AttributeError.
    approval = str(approval_status).lower().strip() if approval_status else ''

    if approval == 'approved':
        return 'active' if has_subscription else 'pre_validated'
    if approval == 'pending':
        return 'payment_pending'
    if approval == 'unapproved':
        return 'inactive'

    # Empty or unknown approval status.
    return 'pre_validated'
# ============================================================================
# Data Validation and Standardization
# ============================================================================
def standardize_phone(phone: str) -> str:
    """
    Standardize a phone number to a bare 10-digit string.

    Removes all non-digit characters:
        - (713) 560-7850 → 7135607850
        - 713-725-8902 → 7137258902
        - 1-713-560-7850 → 7135607850 (leading US country code dropped)
        - Empty/None/NaN → 0000000000 (fallback)

    Args:
        phone: Phone number in any format. Whole-number floats (as pandas
            produces for a numeric column containing blanks, e.g.
            7135607850.0) are accepted and read as the integer they
            represent.

    Returns:
        10-digit phone number string, or '0000000000' if the value is
        missing or does not reduce to a 10-digit number. Invalid values
        are also logged as a warning.

    Examples:
        >>> standardize_phone('(713) 560-7850')
        '7135607850'
        >>> standardize_phone('713-725-8902')
        '7137258902'
        >>> standardize_phone(7135607850.0)
        '7135607850'
        >>> standardize_phone('')
        '0000000000'
    """
    if not phone or pd.isna(phone):
        return '0000000000'

    # str(7135607850.0) is '7135607850.0'; stripping non-digits from that
    # would append a spurious trailing 0.  Collapse whole floats to int first.
    value = phone
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    digits = re.sub(r'\D', '', str(value))

    if len(digits) == 10:
        return digits
    if len(digits) == 11 and digits[0] == '1':
        # Remove leading 1 (US country code).
        return digits[1:]

    logger.warning("Invalid phone format: %s (extracted: %s)", phone, digits)
    return '0000000000'
def validate_dob(dob_str: str) -> Tuple[Optional[datetime], Optional[str]]:
    """
    Validate and parse a date of birth.

    Validation rules:
        - Must be in MM/DD/YYYY format
        - Year 0000 is rejected with a dedicated message
        - Year must be between 1900 and the current year
        - Cannot be in the future

    Args:
        dob_str: Date of birth string in MM/DD/YYYY format. Empty, None and
            NaN values are reported as missing.

    Returns:
        Tuple of (parsed_datetime, warning_message)
        - parsed_datetime: datetime object if valid, None if invalid
        - warning_message: Descriptive message if invalid, None if valid

    Examples:
        >>> validate_dob('08/02/1962')
        (datetime.datetime(1962, 8, 2, 0, 0), None)
        >>> validate_dob('08/02/0000')
        (None, 'Invalid year: 0000 (data quality issue)')
        >>> validate_dob('')
        (None, 'Missing date of birth')
    """
    if not dob_str or pd.isna(dob_str):
        return None, 'Missing date of birth'

    text = str(dob_str).strip()

    # datetime cannot represent year 0, so strptime raises ValueError for it
    # and a post-parse "year == 0" test can never fire.  Detect the
    # placeholder year on the raw text to report it specifically.
    if re.fullmatch(r'\d{1,2}/\d{1,2}/0{4}', text):
        return None, 'Invalid year: 0000 (data quality issue)'

    try:
        parsed = datetime.strptime(text, '%m/%d/%Y')
    except ValueError:
        return None, f'Invalid date format: {dob_str} (expected MM/DD/YYYY)'

    # Take one snapshot of "now" so both future checks agree with each other.
    now = datetime.now()
    if parsed.year < 1900:
        return None, f'Year too old: {parsed.year} (likely invalid)'
    if parsed.year > now.year:
        return None, f'Date is in the future: {parsed.year}'
    if parsed > now:
        return None, 'Date is in the future'

    return parsed, None
# ============================================================================
# CSV Analysis and Preview Generation
# ============================================================================
def analyze_csv(file_path: str, existing_emails: Optional[set] = None) -> Dict:
    """
    Analyze a WordPress CSV export and build preview data with suggestions.

    This is the main entry point for CSV processing. It:
        1. Reads the CSV file (every column is read as text)
        2. Validates each row and collects warnings/errors
        3. Maps WordPress roles to LOAF roles
        4. Suggests a status for each user
        5. Tracks data quality metrics
        6. Flags duplicate emails (within the CSV and against the database)
        7. Returns the analysis summary and per-row preview data

    Columns read from the CSV: user_email, first_name, last_name,
    date_of_birth, cell_phone, wp_capabilities, wppb_approval_status, ID,
    user_registered, newsletter_consent, newsletter_checklist.

    Args:
        file_path: Path to the WordPress CSV export file.
        existing_emails: Emails already in the database (optional). They are
            compared case-insensitively against the CSV emails.

    Returns:
        Dictionary containing:
        - total_rows: Total number of user rows
        - valid_rows: Number of rows without errors
        - warnings: Total warning count across all rows
        - errors: Total error count across all rows
        - preview_data: List of row dictionaries with suggestions
        - data_quality: Dictionary of data quality counters

    Example output:
        {
            'total_rows': 183,
            'valid_rows': 176,
            'warnings': 66,
            'errors': 7,
            'preview_data': [
                {
                    'row_number': 1,
                    'email': 'user@example.com',
                    'first_name': 'John',
                    'last_name': 'Doe',
                    'phone': '7135607850',
                    'date_of_birth': '1962-08-02T00:00:00',
                    'wordpress_user_id': 42,
                    'wordpress_registered': '2020-01-15 10:30:00',
                    'wordpress_roles': ['registered', 'pms_subscription_plan_63'],
                    'wordpress_approval_status': 'approved',
                    'has_subscription': True,
                    'suggested_role': 'member',
                    'suggested_status': 'active',
                    'warnings': [],
                    'errors': [],
                    'newsletter_consent': True,
                    'newsletter_checklist': False
                },
                ...
            ],
            'data_quality': {
                'invalid_dob': 66,
                'missing_phone': 7,
                'duplicate_email_csv': 0,
                'duplicate_email_db': 3,
                'unparseable_roles': 2,
                'missing_email': 0
            }
        }
    """
    import pandas as pd

    def cell_text(value) -> str:
        # Missing cells arrive as None/NaN; str() would turn them into the
        # literal text 'nan', so map them to an empty string instead.
        return '' if pd.isna(value) else str(value).strip()

    def cell_int(value) -> Optional[int]:
        # Integer cell or None when empty/unparseable; tolerates "123.0".
        text = cell_text(value)
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            try:
                return int(float(text))
            except (ValueError, OverflowError):
                return None

    # Read every column as text: type inference would turn phone numbers and
    # IDs into floats whenever the column contains a blank cell.
    df = pd.read_csv(file_path, dtype=str)
    total_rows = len(df)

    preview_data = []
    data_quality = {
        'invalid_dob': 0,
        'missing_phone': 0,
        'duplicate_email_csv': 0,
        'duplicate_email_db': 0,
        'unparseable_roles': 0,
        'missing_email': 0
    }

    # email -> first row number it was seen on (CSV duplicate detection).
    seen_emails = {}

    # CSV emails are lowercased below, so normalise the database emails the
    # same way; this also accepts any iterable, not only a set.
    known_emails = {str(item).strip().lower() for item in (existing_emails or ())}

    for row_num, (_, row) in enumerate(df.iterrows(), start=1):
        warnings = []
        errors = []

        # Extract and validate email.
        email = cell_text(row.get('user_email')).lower()
        if not email:
            errors.append('Missing email address')
            data_quality['missing_email'] += 1
        elif email in seen_emails:
            errors.append(f'Duplicate email in CSV (also in row {seen_emails[email]})')
            data_quality['duplicate_email_csv'] += 1
        elif email in known_emails:
            errors.append('Email already exists in database')
            data_quality['duplicate_email_db'] += 1
        else:
            seen_emails[email] = row_num

        # Extract basic fields.
        first_name = cell_text(row.get('first_name'))
        last_name = cell_text(row.get('last_name'))

        # Parse and validate DOB.
        dob_parsed, dob_warning = validate_dob(row.get('date_of_birth'))
        if dob_warning:
            warnings.append(dob_warning)
            data_quality['invalid_dob'] += 1

        # Standardize phone; the all-zero value is the helper's fallback.
        phone = standardize_phone(row.get('cell_phone'))
        if phone == '0000000000':
            warnings.append('Missing or invalid phone number')
            data_quality['missing_phone'] += 1

        # Parse WordPress roles.  Only a non-empty cell that yields no roles
        # counts as unparseable; a missing cell is simply "no roles".
        wp_capabilities = row.get('wp_capabilities', '')
        wp_roles = parse_php_serialized(wp_capabilities)
        if not wp_roles and cell_text(wp_capabilities):
            warnings.append('Could not parse WordPress roles')
            data_quality['unparseable_roles'] += 1

        # Map to LOAF role and (possibly fixed) status.
        loaf_role, role_suggested_status = map_wordpress_role(wp_roles)
        has_subscription = 'pms_subscription_plan_63' in wp_roles
        approval_status = cell_text(row.get('wppb_approval_status'))

        if role_suggested_status:
            # Staff roles carry a fixed status from the role mapping.
            suggested_status = role_suggested_status
        else:
            # Regular users get their status from the approval logic.
            suggested_status = suggest_status(approval_status, has_subscription, loaf_role)

        preview_data.append({
            'row_number': row_num,
            'email': email,
            'first_name': first_name,
            'last_name': last_name,
            'phone': phone,
            'date_of_birth': dob_parsed.isoformat() if dob_parsed else None,
            'wordpress_user_id': cell_int(row.get('ID')),
            'wordpress_registered': cell_text(row.get('user_registered')),
            'wordpress_roles': wp_roles,
            'wordpress_approval_status': approval_status,
            'has_subscription': has_subscription,
            'suggested_role': loaf_role,
            'suggested_status': suggested_status,
            'warnings': warnings,
            'errors': errors,
            'newsletter_consent': str(row.get('newsletter_consent', '')).lower() == 'yes',
            'newsletter_checklist': str(row.get('newsletter_checklist', '')).lower() == 'yes'
        })

    # Summary statistics.
    valid_rows = sum(1 for entry in preview_data if not entry['errors'])
    total_warnings = sum(len(entry['warnings']) for entry in preview_data)
    total_errors = sum(len(entry['errors']) for entry in preview_data)

    return {
        'total_rows': total_rows,
        'valid_rows': valid_rows,
        'warnings': total_warnings,
        'errors': total_errors,
        'preview_data': preview_data,
        'data_quality': data_quality
    }
# ============================================================================
# Utility Functions
# ============================================================================
def get_status_badge_color(status: str) -> str:
    """
    Look up the badge styling used to display a user status in the UI.

    Args:
        status: User status string.

    Returns:
        Tailwind CSS background/text class pair. Statuses without a
        dedicated entry get the same gray classes as 'inactive'.
    """
    neutral = 'bg-gray-100 text-gray-800'
    palette = {
        'active': 'bg-green-100 text-green-800',
        'pre_validated': 'bg-blue-100 text-blue-800',
        'payment_pending': 'bg-yellow-100 text-yellow-800',
        'inactive': neutral,
        'pending_email': 'bg-purple-100 text-purple-800',
        'awaiting_event': 'bg-indigo-100 text-indigo-800',
    }
    try:
        return palette[status]
    except KeyError:
        return neutral
def format_preview_for_display(preview_data: List[Dict], page: int = 1, page_size: int = 50) -> Dict:
    """
    Slice preview data into one page for display in the frontend.

    Args:
        preview_data: Full preview data list.
        page: Page number (1-indexed). Values below 1 are treated as 1.
        page_size: Number of rows per page. Values below 1 are treated as 1.

    Returns:
        Dictionary with:
        - page: The (clamped) page number that was served
        - page_size: The (clamped) page size that was used
        - total_pages: Number of pages needed for all rows (0 when empty)
        - total_rows: Total number of rows in preview_data
        - rows: Rows belonging to the requested page; empty when the page
          lies beyond the last one
    """
    # Clamp instead of computing negative slice bounds (which would return
    # rows from the tail of the list) or dividing by zero.
    page = max(1, page)
    page_size = max(1, page_size)

    total_rows = len(preview_data)
    # Ceiling division without floats.
    total_pages = (total_rows + page_size - 1) // page_size

    start_idx = (page - 1) * page_size
    end_idx = start_idx + page_size

    return {
        'page': page,
        'page_size': page_size,
        'total_pages': total_pages,
        'total_rows': total_rows,
        'rows': preview_data[start_idx:end_idx]
    }
# ============================================================================
# Module Initialization
# ============================================================================
# Import pandas for CSV processing.
# This import sits at the bottom of the module, yet it is what binds the
# module-global ``pd`` that parse_php_serialized(), standardize_phone() and
# validate_dob() call via pd.isna(); those functions only resolve the name
# when they are invoked, which is after this statement has run.
try:
    import pandas as pd
except ImportError:
    # Log an install hint, then re-raise so importing this module still fails.
    logger.error("pandas library not found. Please install: pip install pandas")
    raise
# Emitted once per import of this module, after pandas loaded successfully.
logger.info("WordPress parser module loaded successfully")