read_tool
Extract and process single-cell RNA sequencing data from multiple file formats (h5ad, 10x, text files) or directories. Supports memory-efficient backed modes, URL retrieval, and customizable parsing options for analysis.
Instructions
Read data from various file formats (h5ad, 10x, text files, etc.) or from a directory path.
Input Schema
Name | Required | Description | Default |
---|---|---|---|
backed | No | If 'r', load AnnData in 'backed' mode instead of fully loading it into memory ('memory' mode). If you want to modify backed attributes of the AnnData object, you need to choose 'r+'. | |
backup_url | No | Retrieve the file from a URL if not present on disk. | |
cache | No | If False, read from source; if True, read from fast 'h5ad' cache. | |
cache_compression | No | See the h5py dataset_compression. (Default: settings.cache_compression) | |
delimiter | No | Delimiter that separates data within a text file. If None, splits on any run of whitespace, which is different from splitting at every single whitespace character. | |
ext | No | Extension that indicates the file type. If None, uses extension of filename. | |
filename | Yes | Path to the file to read. | |
first_column_names | No | Assume the first column stores row names. This is only necessary if these are not strings: strings in the first column are automatically assumed to be row names. | |
first_column_obs | No | If True, assume the first column stores observation (cell or barcode) names when a text file is provided. If False, the data will be transposed. | |
gex_only | No | Only keep 'Gene Expression' data and ignore other feature types, e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'. Used for 10x formats. | |
make_unique | No | Whether to make the variables index unique by appending '-1', '-2' etc. or not. Used for 10x mtx format. | |
prefix | No | Any prefix before matrix.mtx, genes.tsv and barcodes.tsv. For instance, if the files are named patientA_matrix.mtx, patientA_genes.tsv and patientA_barcodes.tsv the prefix is patientA_. Used for 10x mtx format. | |
sampleid | No | Sample identifier to mark and distinguish different samples. | |
sheet | No | Name of sheet/table in hdf5 or Excel file. | |
var_names | No | The variables index for 10x mtx format. Either 'gene_symbols' or 'gene_ids'. | gene_symbols |
Input Schema (JSON Schema)
{
"description": "Input schema for the read tool.",
"properties": {
"backed": {
"anyOf": [
{
"enum": [
"r",
"r+"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "If 'r', load AnnData in 'backed' mode instead of fully loading it into memory ('memory' mode). If you want to modify backed attributes of the AnnData object, you need to choose 'r+'.",
"title": "Backed"
},
"backup_url": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Retrieve the file from a URL if not present on disk.",
"title": "Backup Url"
},
"cache": {
"default": false,
"description": "If False, read from source; if True, read from fast 'h5ad' cache.",
"title": "Cache",
"type": "boolean"
},
"cache_compression": {
"anyOf": [
{
"enum": [
"gzip",
"lzf"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "See the h5py dataset_compression. (Default: settings.cache_compression)",
"title": "Cache Compression"
},
"delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Delimiter that separates data within text file. If None, will split at arbitrary number of white spaces, which is different from enforcing splitting at any single white space.",
"title": "Delimiter"
},
"ext": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Extension that indicates the file type. If None, uses extension of filename.",
"title": "Ext"
},
"filename": {
"description": "Path to the file to read.",
"title": "Filename",
"type": "string"
},
"first_column_names": {
"default": false,
"description": "Assume the first column stores row names. This is only necessary if these are not strings: strings in the first column are automatically assumed to be row names.",
"title": "First Column Names",
"type": "boolean"
},
"first_column_obs": {
"default": true,
"description": "If True, assume the first column stores observation (cell or barcode) names when a text file is provided. If False, the data will be transposed.",
"title": "First Column Obs",
"type": "boolean"
},
"gex_only": {
"default": true,
"description": "Only keep 'Gene Expression' data and ignore other feature types, e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'. Used for 10x formats.",
"title": "Gex Only",
"type": "boolean"
},
"make_unique": {
"default": true,
"description": "Whether to make the variables index unique by appending '-1', '-2' etc. or not. Used for 10x mtx format.",
"title": "Make Unique",
"type": "boolean"
},
"prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Any prefix before matrix.mtx, genes.tsv and barcodes.tsv. For instance, if the files are named patientA_matrix.mtx, patientA_genes.tsv and patientA_barcodes.tsv the prefix is patientA_. Used for 10x mtx format.",
"title": "Prefix"
},
"sampleid": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Sample identifier to mark and distinguish different samples.",
"title": "Sampleid"
},
"sheet": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of sheet/table in hdf5 or Excel file.",
"title": "Sheet"
},
"var_names": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "gene_symbols",
"description": "The variables index for 10x mtx format. Either 'gene_symbols' or 'gene_ids'.",
"title": "Var Names"
}
},
"required": [
"filename"
],
"title": "ReadModel",
"type": "object"
}