68 - Export FASTA files of each MAG by quality tier and binning tool

Bioinformatics Ka Chilla
#!/bin/bash
# ==============================================================================
# 13_export_MAG_fastas_FIXED.sh — Export MAG FASTA files by quality tier
# Searches SUMMARY_*, *_bins/, and das_tool/ directories
# ==============================================================================


# Strict mode: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -euo pipefail

# Fail early with a readable message if conda is not on PATH.
if ! command -v conda >/dev/null 2>&1; then
  echo "ERROR: 'conda' not found in PATH; cannot activate the anvio-9 environment." >&2
  exit 1
fi

# Load conda's shell integration so `conda activate` works in this
# non-interactive shell, then switch to the anvi'o environment.
# conda's hook and per-package activation scripts may reference unset
# variables, which would abort the script under `set -u`; nounset is therefore
# relaxed only for the activation and restored right after.
set +u
eval "$(conda shell.bash hook)"
conda activate anvio-9
set -u

# Project layout. HQ_DIR holds the HQ/MQ MAG table read by the Python steps;
# OUTPUT_DIR receives the exported FASTA files.
BASE_DIR="/media/codanics/bioinformatics/05_anvio_metagenomics"
ANVIO_DIR="${BASE_DIR}/09_anvio"
HQ_DIR="${ANVIO_DIR}/11_HQ_MAGS_FOR_ANALYSIS"
OUTPUT_DIR="${ANVIO_DIR}/12_MAG_FASTA_EXPORTS"

mkdir -p "${OUTPUT_DIR}"

echo "========================================"
echo "Exporting MAG FASTA Files by Quality"
echo "========================================"


# ==========================================
# PART 1: Create folder structure
# ==========================================
# Reads HQ_MQ_MAGs_for_analysis.tsv from HQ_DIR and creates, under OUTPUT_DIR:
#   <tier>/<domain>/           for the two quality tiers and four fixed domains
#   By_Collection_<name>/      one per distinct value of the 'Collection' column
echo ""
echo "=== Creating folder structure ==="

python3 - "$HQ_DIR" "$OUTPUT_DIR" << 'PYEOF'
"""Create the output directory tree for the MAG FASTA export.

argv[1]: directory containing HQ_MQ_MAGs_for_analysis.tsv
argv[2]: output directory in which the folders are created
"""
import os
import sys

import pandas as pd

hq_dir = sys.argv[1]
outdir = sys.argv[2]

hq_mq_file = os.path.join(hq_dir, 'HQ_MQ_MAGs_for_analysis.tsv')
# The table is tab-separated; the separator must be the tab escape, not the
# letter "t".
df = pd.read_csv(hq_mq_file, sep='\t')

print(f"Total HQ+MQ MAGs: {len(df)}")
print(f"By Quality Tier:\n{df['Quality_Tier'].value_counts()}")

# One folder per quality tier, each with one sub-folder per domain.
for tier in ['High_Quality', 'Medium_Quality']:
    tier_dir = os.path.join(outdir, tier)
    os.makedirs(tier_dir, exist_ok=True)
    for domain in ['Bacteria', 'Archaea', 'Eukarya', 'Unclassified']:
        os.makedirs(os.path.join(tier_dir, domain), exist_ok=True)

# One folder per collection present in the table.
for collection in df['Collection'].unique():
    os.makedirs(os.path.join(outdir, f'By_Collection_{collection}'), exist_ok=True)

print(f"\nFolders created in: {outdir}")
PYEOF


# ==========================================
# PART 2: Export FASTA files from summary directories
# ==========================================
# For every MAG listed in HQ_MQ_MAGs_for_analysis.tsv, looks for its FASTA in
# SUMMARY_<collection>/bin_by_bin/, then in das_tool/ (collection DASTOOL) or
# <collection-lowercase>_bins/, and copies the first match under a descriptive
# name into the tier, tier/domain and By_Collection_<collection> folders.
# Finally writes MAG_export_manifest.tsv into OUTPUT_DIR.
echo ""
echo "=== Exporting FASTA files ==="

python3 - "$HQ_DIR" "$OUTPUT_DIR" "$ANVIO_DIR" << 'PYEOF'
"""Copy MAG FASTA files into the export tree and write a manifest.

argv[1]: directory containing HQ_MQ_MAGs_for_analysis.tsv
argv[2]: output directory (export tree root)
argv[3]: anvi'o working directory that holds the candidate source folders
"""
import os
import shutil
import sys

import pandas as pd


def candidate_sources(anvio_dir, collection, bin_name):
    """Return the possible FASTA locations for one bin, in lookup order."""
    summary_bin = os.path.join(anvio_dir, f'SUMMARY_{collection}', 'bin_by_bin', bin_name)
    # Binner output folder: das_tool/ for DASTOOL, otherwise <lowercase>_bins/.
    binner_dir = 'das_tool' if collection == 'DASTOOL' else f'{collection.lower()}_bins'
    return [
        os.path.join(summary_bin, f'{bin_name}-contigs.fa'),
        os.path.join(summary_bin, f'{bin_name}.fa'),
        os.path.join(anvio_dir, binner_dir, f'{bin_name}.fa'),
        os.path.join(anvio_dir, binner_dir, f'{bin_name}.fasta'),
    ]


hq_dir = sys.argv[1]
outdir = sys.argv[2]
anvio_dir = sys.argv[3]

hq_mq_file = os.path.join(hq_dir, 'HQ_MQ_MAGs_for_analysis.tsv')
# Tab-separated input; the separator must be the tab escape, not the letter "t".
df = pd.read_csv(hq_mq_file, sep='\t')

print(f"Processing {len(df)} MAGs...\n")

exported = 0
missing = 0
manifest_data = []

for _, row in df.iterrows():
    collection = row['Collection']
    bin_name = row['bin_name']
    tier = row['Quality_Tier'].replace('-', '_')
    domain = row['domain'] if pd.notna(row['domain']) else 'Unclassified'
    genus = row['genus'] if pd.notna(row['genus']) and str(row['genus']) != 'Unclassified' else 'Unknown'
    completeness = row['Completeness']
    contamination = row['Contamination']

    # Descriptive filename; spaces and slashes in the genus are replaced so the
    # name stays a single path component.
    safe_genus = str(genus).replace(' ', '_').replace('/', '_')
    new_name = f"{collection}_{bin_name}_{safe_genus}_comp{completeness:.1f}_cont{contamination:.1f}.fa"

    # Destinations relative to the export root: tier, tier/domain, collection.
    rel_dirs = [
        tier,
        os.path.join(tier, domain),
        f'By_Collection_{collection}',
    ]

    # The manifest lists every MAG of the table, whether or not its FASTA is
    # found below.
    manifest_data.append({
        'original_bin': bin_name,
        'collection': collection,
        'quality_tier': tier,
        'domain': domain,
        'phylum': row.get('phylum', 'Unknown'),
        'genus': genus,
        'completeness': completeness,
        'contamination': contamination,
        'filename': new_name,
        'path_tier': os.path.join(rel_dirs[0], new_name),
        'path_domain': os.path.join(rel_dirs[1], new_name),
        'path_collection': os.path.join(rel_dirs[2], new_name)
    })

    # First candidate that is an existing file wins.
    source_paths = candidate_sources(anvio_dir, collection, bin_name)
    source_fa = next((sp for sp in source_paths if os.path.isfile(sp)), None)

    if source_fa is None:
        print(f" ⚠ {collection}/{bin_name}: FASTA not found")
        print(f" Tried: {source_paths[0]}")
        missing += 1
        continue

    for rel_dir in rel_dirs:
        dest_dir = os.path.join(outdir, rel_dir)
        # Create the folder on demand so a tier or domain value outside the
        # pre-created set does not make the copy fail.
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy2(source_fa, os.path.join(dest_dir, new_name))

    print(f" ✓ {collection}/{bin_name} → {tier}/{domain}/")
    exported += 1

print(f"\n{'='*50}")
print(f"Exported: {exported} MAGs")
print(f"Missing: {missing} MAGs")
print(f"{'='*50}")

# Write the manifest as a tab-separated table.
manifest_path = os.path.join(outdir, 'MAG_export_manifest.tsv')
manifest_df = pd.DataFrame(manifest_data)
manifest_df.to_csv(manifest_path, sep='\t', index=False)
print(f"\nManifest: {manifest_path}")
PYEOF


# ==========================================
# PART 3: Verify exports
# ==========================================
# Walks OUTPUT_DIR (skipping hidden directories) and prints, per folder, the
# number and size of the .fa/.fasta files found, followed by a grand total.
echo ""
echo "=== Verifying exports ==="

python3 - "$OUTPUT_DIR" << 'PYEOF'
"""Print a per-folder listing of exported FASTA files.

argv[1]: export tree root to scan
"""
import os
import sys

outdir = sys.argv[1]

print("\nExported MAG FASTA files:")
print("=" * 60)

total_files = 0
total_size = 0

for root, dirs, files in os.walk(outdir):
    # Prune hidden directories in place so os.walk does not descend into them.
    dirs[:] = [d for d in dirs if not d.startswith('.')]

    fa_files = [f for f in files if f.endswith('.fa') or f.endswith('.fasta')]
    if fa_files:
        rel_path = os.path.relpath(root, outdir)
        size = sum(os.path.getsize(os.path.join(root, f)) for f in fa_files)
        total_files += len(fa_files)
        total_size += size

        print(f"\n {rel_path}/")
        print(f" {len(fa_files)} files, {size/1024/1024:.1f} MB")
        for f in sorted(fa_files):
            fpath = os.path.join(root, f)
            fsize = os.path.getsize(fpath) / 1024
            print(f" {f} ({fsize:.1f} KB)")

print(f"\n{'='*60}")
print(f"TOTAL: {total_files} files, {total_size/1024/1024:.1f} MB")
print(f"{'='*60}")
PYEOF


# Final banner and a short preview of the export tree.
echo ""
echo "========================================"
echo "MAG FASTA EXPORT COMPLETE"
echo "========================================"
echo ""
echo "Output: ${OUTPUT_DIR}"
echo ""
# Show only the first 50 lines of the recursive listing. `head` may close the
# pipe before `ls` has finished writing; under `set -o pipefail` with `set -e`
# that SIGPIPE exit status would abort the script here, so the status of this
# purely informational pipeline is deliberately ignored.
ls -R "${OUTPUT_DIR}" | head -n 50 || true
echo "========================================"