import pandas as pd
import numpy as np
from datetime import datetime
import calendar
import json
from great_tables import GT, md

# Appointment types that count as non-permanent for categorization purposes.
TERM_TEMPORARY_TYPES = {'Term', 'Temporary', 'Seasonal', 'Summer',
                        'Intermittent', 'Internships'}


def create_standard_gt_table(data, title, subtitle="", align_left_cols=None,
                             align_center_cols=None, col_widths=None,
                             include_source=True):
    """Create a standardized Great Tables table with common formatting.

    Args:
        data: DataFrame to display; its index is dropped before rendering.
        title: Table title.
        subtitle: Optional subtitle; a USAJobs attribution is appended when
            ``include_source`` is True and not already mentioned.
        align_left_cols: Column names to left-align, or None.
        align_center_cols: Column names to center-align, or None.
        col_widths: Mapping of column name -> width string (e.g. "40%"), or None.
        include_source: Whether to add the USAJobs attribution to the subtitle.

    Returns:
        A configured ``GT`` object (caller renders it with ``.show()``).
    """
    # Add USAJobs data attribution to subtitle if not already included.
    if include_source and subtitle and "USAJobs" not in subtitle:
        subtitle = f"{subtitle} | USAJobs Historical Data"
    elif include_source and not subtitle:
        subtitle = "USAJobs Historical Data"

    gt_table = GT(data.reset_index(drop=True))
    gt_table = gt_table.tab_header(title=title, subtitle=subtitle)
    gt_table = gt_table.tab_source_note(
        md("*Source: github.com/abigailhaddad/usajobs_historical*"))

    # Apply alignments and widths only where requested.
    if align_left_cols:
        gt_table = gt_table.cols_align(align="left", columns=align_left_cols)
    if align_center_cols:
        gt_table = gt_table.cols_align(align="center", columns=align_center_cols)
    if col_widths:
        gt_table = gt_table.cols_width(col_widths)

    # Let GT keep its own styling instead of Quarto's table processing.
    gt_table = gt_table.tab_options(quarto_disable_processing=True)
    return gt_table


def get_current_datetime():
    """Get current date and time info as a dict for consistent usage."""
    current_date = datetime.now()
    return {
        'date': current_date,
        'year': current_date.year,
        'month': current_date.month,
        'formatted': current_date.strftime('%Y-%m-%d %H:%M:%S'),
    }


def extract_series(job_categories):
    """Extract the first occupational series from the JobCategories JSON field.

    Returns 'Unknown' for missing, malformed, or series-less category data.
    """
    try:
        if pd.isna(job_categories):
            return 'Unknown'
        categories = json.loads(job_categories)
        if categories and 'series' in categories[0]:
            return categories[0]['series']
        return 'Unknown'
    # Narrowed from a bare ``except:`` so only data problems are swallowed;
    # genuine bugs (e.g. NameError) are no longer silently hidden.
    except (TypeError, ValueError, KeyError, IndexError):
        return 'Unknown'


def categorize_appointment(appt_type):
    """Categorize an appointment type as Permanent, Term/Temporary, or Other."""
    if appt_type == 'Permanent':
        return 'Permanent'
    if appt_type in TERM_TEMPORARY_TYPES:
        return 'Term/Temporary'
    return 'Other'


def _last_complete_month(today):
    """Return (year, month) of the most recent complete calendar month.

    From the 2nd of a month onward the previous month is considered
    complete; on the 1st, data may still be arriving, so the month before
    that is used instead.
    """
    months_back = 1 if today.day >= 2 else 2
    year, month = today.year, today.month - months_back
    if month <= 0:  # wrapped past January
        month += 12
        year -= 1
    return year, month


def load_nps_data():
    """Load and prepare National Park Service job data.

    Reads historical (and, when available, current) USAJobs parquet files
    for 2018-2025, keeps only postings through the last complete month, and
    subsets to National Park Service postings.

    Returns:
        Tuple of (nps_df, loading_summary, loading_stats) DataFrames.
    """
    # Load all years from 2018 onwards.
    years = range(2018, 2026)
    all_data = []
    year_counts = []
    for year in years:
        # Reset each iteration so the current-jobs dedup below never compares
        # against a stale frame (the original used ``'df' in locals()``, which
        # kept the previous year's DataFrame alive when a file was missing).
        historical_df = None
        try:
            historical_df = pd.read_parquet(f'../../data/historical_jobs_{year}.parquet')
            year_counts.append({'Year': year, 'Jobs Loaded': f"{len(historical_df):,}"})
            all_data.append(historical_df)
        except FileNotFoundError:
            year_counts.append({'Year': year, 'Jobs Loaded': "No data"})

        # Load current data if available, deduplicated against this year's
        # historical postings by usajobsControlNumber.
        try:
            current_df = pd.read_parquet(f'../../data/current_jobs_{year}.parquet')
        except FileNotFoundError:
            continue
        if len(current_df) == 0:
            continue
        existing_control_numbers = (
            set(historical_df['usajobsControlNumber'])
            if historical_df is not None else set()
        )
        new_current_jobs = current_df[
            ~current_df['usajobsControlNumber'].isin(existing_control_numbers)]
        if len(new_current_jobs) > 0:
            all_data.append(new_current_jobs)
            year_counts[-1]['Jobs Loaded'] += f" + {len(new_current_jobs):,} current"

    # Data loading summary table (one row per year).
    loading_summary = pd.DataFrame(year_counts)

    # Combine all years; parse dates with mixed-format handling.
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['positionOpenDate'] = pd.to_datetime(
        combined_df['positionOpenDate'], format='mixed')
    combined_df['year'] = combined_df['positionOpenDate'].dt.year
    combined_df['month'] = combined_df['positionOpenDate'].dt.month

    # Filter to only include data through the last complete month.
    last_complete_year, last_complete_month = _last_complete_month(datetime.now())
    combined_df = combined_df[
        (combined_df['year'] < last_complete_year)
        | ((combined_df['year'] == last_complete_year)
           & (combined_df['month'] <= last_complete_month))
    ].copy()
    print(f"Data includes postings through "
          f"{calendar.month_name[last_complete_month]} {last_complete_year}")

    # Filter for National Park Service and derive analysis columns.
    nps_df = combined_df[combined_df['hiringAgencyName'] == 'National Park Service'].copy()
    nps_df['occupational_series'] = nps_df['JobCategories'].apply(extract_series)
    nps_df['appt_category'] = nps_df['appointmentType'].apply(categorize_appointment)

    # Create summary stats.
    loading_stats = pd.DataFrame({
        'Metric': ['Total jobs loaded', 'National Park Service jobs', 'Data coverage'],
        'Value': [
            f"{len(combined_df):,}",
            f"{len(nps_df):,}",
            f"{len(year_counts)} years "
            f"(2018-{calendar.month_name[last_complete_month]} {last_complete_year})",
        ],
    })
    return nps_df, loading_summary, loading_stats


# Load data
nps_df, loading_summary, loading_stats = load_nps_data()

# Create filtered datasets for year-over-year comparison.
# For 2025, use all available months; for historical years, the same months.
jobs_2025 = nps_df[nps_df['year'] == 2025]
max_2025_month = jobs_2025['month'].max() if len(jobs_2025) > 0 else 12
comparison_months = list(range(1, max_2025_month + 1))
month_names = [calendar.month_name[m][:3] for m in comparison_months]
comparison_period = f"{month_names[0]}-{month_names[-1]}"

# Filter datasets to the same months for a fair comparison.
nps_comparison_months = nps_df[nps_df['month'].isin(comparison_months)].copy()
nps_2025_comparison = nps_comparison_months[nps_comparison_months['year'] == 2025]
nps_historical_comparison = nps_comparison_months[
    nps_comparison_months['year'].between(2018, 2024)]

# Display data loading summary as a Great Table.
gt_loading_stats = (
    GT(loading_stats.reset_index(drop=True))
    .tab_header(title="Data Loading & Filtering Summary",
                subtitle="USAJobs Data Processing Results")
    .cols_align(align="left", columns=["Metric"])
    .cols_align(align="center", columns=["Value"])
    .cols_width({"Metric": "60%", "Value": "40%"})
    .tab_options(quarto_disable_processing=True)
)
gt_loading_stats.show()

# Show how raw appointment types map to the three analysis categories.
# value_counts() is computed once instead of three times.
appt_counts = nps_df['appointmentType'].value_counts()
appt_breakdown = pd.DataFrame({
    'Appointment Type': appt_counts.index,
    'Count': appt_counts.values,
    'Category': [categorize_appointment(x) for x in appt_counts.index],
})
gt_appt = (
    create_standard_gt_table(
        data=appt_breakdown,
        title="Appointment Type Categorization",
        subtitle="National Park Service Job Types (2018-2025)",
        align_left_cols=["Appointment Type", "Category"],
        align_center_cols=["Count"],
        col_widths={"Appointment Type": "45%", "Count": "20%", "Category": "35%"},
    )
    .fmt_number(columns=["Count"], sep_mark=",", decimals=0)
)
gt_appt.show()
Data includes postings through August 2025
Data Loading & Filtering Summary
USAJobs Data Processing Results
Metric
Value
Total jobs loaded
2,752,190
National Park Service jobs
38,295
Data coverage
8 years (2018-August 2025)
Appointment Type Categorization
National Park Service Job Types (2018-2025) | USAJobs Historical Data
# Get current date to limit display.
dt_info = get_current_datetime()
current_year = dt_info['year']
current_month = dt_info['month']

# Last complete (year, month) actually present in the loaded data
# (calculated during loading); used to blank out future months in the
# visualizations below.
last_complete_info = (
    nps_df['year'].max(),
    nps_df[nps_df['year'] == nps_df['year'].max()]['month'].max(),
)


def should_show_month(year, month):
    """Return True if (year, month) is on or before the last month with data."""
    last_year, last_month = last_complete_info
    if year < last_year:
        return True
    if year == last_year:
        return month <= last_month
    return False


# Constants
MONTH_LABELS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def create_heatmap_table(df_subset, title, subtitle=""):
    """Render a month x year job-count heatmap as a Great Tables table.

    Args:
        df_subset: DataFrame with integer ``year`` and ``month`` columns.
        title: Table title.
        subtitle: Optional subtitle (kept as-is: heatmaps already carry the
            repo link in a source note).
    """
    monthly_breakdown = (df_subset.groupby(['year', 'month'])
                         .size().reset_index(name='job_count'))
    monthly_pivot = (monthly_breakdown
                     .pivot(index='month', columns='year', values='job_count')
                     .fillna(0))
    # Guarantee all 12 month rows exist: without this, assigning MONTH_LABELS
    # below raises ValueError whenever some month has no postings at all.
    monthly_pivot = monthly_pivot.reindex(range(1, 13), fill_value=0)

    # Mask future months so they render as gray rather than zero.
    for year in monthly_pivot.columns:
        for month in monthly_pivot.index:
            if not should_show_month(year, month):
                monthly_pivot.loc[month, year] = np.nan

    # Replace month numbers with their abbreviations, then make them a column.
    monthly_pivot.index = MONTH_LABELS
    monthly_pivot_reset = monthly_pivot.reset_index()
    monthly_pivot_reset.columns.name = None
    monthly_pivot_reset = monthly_pivot_reset.rename(columns={'index': 'Month'})

    # Year columns as strings so Great Tables handles them consistently.
    year_cols = [str(col) for col in monthly_pivot_reset.columns
                 if str(col) != 'Month']
    # Upper bound for the color scale (NaN-masked cells are ignored by max).
    max_val = monthly_pivot.max().max()
    monthly_pivot_reset.columns = [str(col) for col in monthly_pivot_reset.columns]

    gt_heatmap = (
        GT(monthly_pivot_reset)
        .tab_header(title=title, subtitle=subtitle)
        .tab_source_note(md("*Source: github.com/abigailhaddad/usajobs_historical*"))
        .fmt_number(columns=year_cols, decimals=0, sep_mark=",")
        .data_color(columns=year_cols,
                    palette=["white", "orange", "darkred"],
                    domain=[0, max_val],
                    na_color="lightgray")
        .cols_align(align="center", columns=year_cols)
        .cols_align(align="left", columns=["Month"])
        .tab_options(quarto_disable_processing=True)
    )
    gt_heatmap.show()


def create_appointment_summary(df):
    """Summarize job counts and shares by appointment category.

    Args:
        df: DataFrame with an ``appt_category`` column.

    Returns:
        DataFrame with Category / Job Count / Percentage display columns.
    """
    permanent_count = len(df[df['appt_category'] == 'Permanent'])
    temp_count = len(df[df['appt_category'] == 'Term/Temporary'])
    total_count = len(df)
    denom = total_count or 1  # avoid ZeroDivisionError on an empty frame
    return pd.DataFrame({
        'Category': ['All NPS Positions', 'Permanent Positions',
                     'Term/Temporary Positions'],
        'Job Count': [f"{total_count:,}", f"{permanent_count:,}", f"{temp_count:,}"],
        'Percentage': ["100%",
                       f"{permanent_count/denom*100:.0f}%",
                       f"{temp_count/denom*100:.0f}%"],
    })


heatmap_summary = create_appointment_summary(nps_df)
gt_heatmap_summary = create_standard_gt_table(
    data=heatmap_summary,
    title="Heatmap Categories Summary",
    subtitle="National Park Service Job Distribution by Appointment Type",
    align_left_cols=["Category"],
    align_center_cols=["Job Count", "Percentage"],
    col_widths={"Category": "50%", "Job Count": "25%", "Percentage": "25%"},
)
gt_heatmap_summary.show()

# 1. All NPS jobs
create_heatmap_table(
    nps_df,
    "National Park Service - All USAJobs Postings by Month and Year", "")

# 2. Permanent positions only
permanent_df = nps_df[nps_df['appt_category'] == 'Permanent']
create_heatmap_table(
    permanent_df,
    "National Park Service - Permanent USAJobs Positions", "")

# 3. Term/Temporary positions only
temp_df = nps_df[nps_df['appt_category'] == 'Term/Temporary']
create_heatmap_table(
    temp_df,
    "National Park Service - Term/Temporary USAJobs Positions", "")
Heatmap Categories Summary
National Park Service Job Distribution by Appointment Type | USAJobs Historical Data