Skip to content

Commit 2809e8e

Browse files
Merge pull request #170 from databio/dev_io
Release 0.4.1
2 parents 440f979 + 42f3f03 commit 2809e8e

File tree

13 files changed

+100
-32
lines changed

13 files changed

+100
-32
lines changed
Binary file not shown.
Binary file not shown.

data/geniml_bb_cache/tokens.zarr/.zgroup

-3
This file was deleted.

geniml/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.0"
1+
__version__ = "0.4.1"

geniml/bbclient/cli.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from logging import getLogger
22

3-
from .const import MODULE_NAME
3+
from .const import MODULE_NAME, DEFAULT_CACHE_FOLDER
44

55
_LOGGER = getLogger(MODULE_NAME)
66

@@ -12,7 +12,7 @@ def build_subparser_cache_bed(parser):
1212
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
1313
parser.add_argument(
1414
"--cache-folder",
15-
default=None,
15+
default=DEFAULT_CACHE_FOLDER,
1616
help="Cache folder path (default: bed_cache)",
1717
)
1818

@@ -26,7 +26,7 @@ def build_subparser_cache_bedset(parser):
2626
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
2727
parser.add_argument(
2828
"--cache-folder",
29-
default=None,
29+
default=DEFAULT_CACHE_FOLDER,
3030
help="Cache folder path (default: bed_cache)",
3131
)
3232

@@ -40,7 +40,7 @@ def build_subparser_seek(parser):
4040
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
4141
parser.add_argument(
4242
"--cache-folder",
43-
default=None,
43+
default=DEFAULT_CACHE_FOLDER,
4444
help="Cache folder path (default: bed_cache)",
4545
)
4646

@@ -53,7 +53,7 @@ def build_subparser_inspect(parser):
5353
"""
5454
parser.add_argument(
5555
"--cache-folder",
56-
default=None,
56+
default=DEFAULT_CACHE_FOLDER,
5757
help="Cache folder path (default: bed_cache)",
5858
)
5959

@@ -75,7 +75,7 @@ def build_subparser_cache_tokens(parser):
7575
)
7676
parser.add_argument(
7777
"--cache-folder",
78-
default=None,
78+
default=DEFAULT_CACHE_FOLDER,
7979
help="Cache folder path (default: bed_cache)",
8080
)
8181

@@ -89,7 +89,7 @@ def build_subparser_remove(parser):
8989
parser.add_argument("identifier", nargs=1, help="BED file identifier, url, or file path")
9090
parser.add_argument(
9191
"--cache-folder",
92-
default=None,
92+
default=DEFAULT_CACHE_FOLDER,
9393
help="Cache folder path (default: bed_cache)",
9494
)
9595

geniml/cli.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,18 @@ def main(test_args=None):
114114
)
115115

116116
if args.command == "bbclient":
117-
if args.subcommand is not None:
117+
if args.subcommand in [
118+
"cache-bed",
119+
"cache-tokens",
120+
"cache-bedset",
121+
"seek",
122+
"inspect",
123+
"rm",
124+
]:
118125
_LOGGER.info(f"Subcommand: {args.subcommand}")
119126
from .bbclient import BBClient
120127

121-
bbc = BBClient()
128+
bbc = BBClient(cache_folder=args.cache_folder)
122129

123130
else:
124131
# if no subcommand, print help format of bbclient subparser
@@ -141,11 +148,8 @@ def main(test_args=None):
141148
if args.subcommand == "cache-bed":
142149
# if input is a BED file path
143150
if os.path.exists(args.identifier[0]):
144-
from .io import RegionSet
145-
146-
bedfile = RegionSet(args.identifier[0])
147-
bbc.add_bed_to_cache(bedfile)
148-
_LOGGER.info(f"BED file {bedfile.compute_bed_identifier()} has been cached")
151+
identifier = bbc.add_bed_to_cache(args.identifier[0])
152+
_LOGGER.info(f"BED file {identifier} has been cached")
149153
else:
150154
bbc.load_bed(args.identifier[0])
151155

geniml/io/io.py

+36-5
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
5555
:param regions: path, or url to bed file or list of Region objects
5656
:param backed: whether to load the bed file into memory or not [Default: False]
5757
"""
58-
# load from file
58+
self._df: Union[pd.DataFrame, None] = None
59+
5960
if isinstance(regions, str):
6061
self.backed = backed
6162
self.regions: List[Region] = []
6263
self.path = regions
63-
6464
self.regions = None
6565
self.is_gzipped = False
6666

@@ -90,6 +90,7 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
9090
df = self._read_gzipped_file(regions)
9191
else:
9292
df = self._read_file_pd(regions, sep="\t", header=None, engine="pyarrow")
93+
self._df = df
9394

9495
_regions = []
9596
df.apply(
@@ -111,6 +112,15 @@ def __init__(self, regions: Union[str, List[Region]], backed: bool = False):
111112

112113
self._identifier = None
113114

115+
def to_pandas(self) -> Union[pd.DataFrame, None]:
116+
if self._df is None:
117+
seqnames, starts, ends = zip(
118+
*[(region.chr, region.start, region.end) for region in self]
119+
)
120+
return pd.DataFrame([seqnames, starts, ends])
121+
122+
return self._df
123+
114124
def _read_gzipped_file(self, file_path: str) -> pd.DataFrame:
115125
"""
116126
Read a gzipped file into a pandas dataframe
@@ -140,12 +150,33 @@ def _read_file_pd(self, *args, **kwargs) -> pd.DataFrame:
140150
if row_count > 0:
141151
_LOGGER.info(f"Skipped {row_count} rows while standardization. File: '{args}'")
142152
df = df.dropna(axis=1)
143-
return df
153+
for index, row in df.iterrows():
154+
if (
155+
isinstance(row[0], str)
156+
and isinstance(row[1], int)
157+
and isinstance(row[2], int)
158+
):
159+
return df
160+
else:
161+
if isinstance(row[1], str):
162+
try:
163+
_ = int(row[1])
164+
df[1] = pd.to_numeric(df[1])
165+
except ValueError:
166+
row_count += 1
167+
break
168+
if isinstance(row[2], str):
169+
try:
170+
_ = int(row[2])
171+
df[2] = pd.to_numeric(df[2])
172+
except ValueError:
173+
row_count += 1
174+
break
175+
return df
144176
except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
145177
if row_count <= max_rows:
146178
row_count += 1
147-
# if can't open file after 5 attempts try to open it with gzip
148-
return self._read_gzipped_file(*args)
179+
raise BEDFileReadError("Cannot read bed file.")
149180

150181
def __len__(self):
151182
return self.length
+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# THIS is big header
2+
# with 4 lines
3+
# and 3rd line
4+
# is empty
5+
chr1 10 30
6+
chr1 110 130
7+
chr1 210 230
+1-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
# THIS is big header
2-
# with 4 lines
3-
# and 3rd line
4-
# is empty
1+
chrom_name one two
52
chr1 10 30
63
chr1 110 130
74
chr1 210 230

tests/data/io_data/bed_bad/s1_empty.bed

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# THIS is big header
2+
# with 4 lines
3+
# and 3rd line
4+
# is empty
5+
# THIS is big header
6+
# with 4 lines
7+
# and 3rd line
8+
# is empty
9+
chr1 10 30
10+
chr1 110 130
11+
chr1 210 230
File renamed without changes.

tests/test_io.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import os
22

33
import genomicranges
4+
import pandas as pd
45
import pytest
56

6-
from geniml.io.exceptions import GenimlBaseError
7+
from geniml.io.exceptions import GenimlBaseError, BEDFileReadError
78
from geniml.io.io import SNP, Maf, Region, RegionSet
89

910
DATA_TEST_FOLDER = os.path.join(
@@ -14,11 +15,15 @@
1415
)
1516
DATA_TEST_FOLDER_BED = os.path.join(DATA_TEST_FOLDER, "bed")
1617
DATA_TEST_FOLDER_MAF = os.path.join(DATA_TEST_FOLDER, "maf")
18+
DATA_TEST_FOLDER_BED_BAD = os.path.join(DATA_TEST_FOLDER, "bed_bad")
1719

1820
ALL_BEDFILE_PATH = [
1921
os.path.join(DATA_TEST_FOLDER_BED, x) for x in os.listdir(DATA_TEST_FOLDER_BED)
2022
]
2123
ALL_MAF_PATH = [os.path.join(DATA_TEST_FOLDER_MAF, x) for x in os.listdir(DATA_TEST_FOLDER_MAF)]
24+
ALL_BADFILE_BAD_PATH = [
25+
os.path.join(DATA_TEST_FOLDER_BED_BAD, x) for x in os.listdir(DATA_TEST_FOLDER_BED_BAD)
26+
]
2227

2328

2429
def test_make_region():
@@ -51,7 +56,7 @@ class TestRegionSet:
5156
@pytest.mark.parametrize(
5257
"url",
5358
[
54-
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
59+
"https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz",
5560
],
5661
)
5762
def test_region_set_from_url(self, url):
@@ -70,11 +75,15 @@ def test_region_set_from_path(self, url):
7075
assert isinstance(region, Region)
7176
break
7277

78+
@pytest.mark.parametrize("path", ALL_BADFILE_BAD_PATH)
79+
def test_broken_bed_from_path(self, path):
80+
with pytest.raises(BEDFileReadError):
81+
region_set = RegionSet(path)
82+
7383
@pytest.mark.parametrize(
7484
"url",
75-
[
76-
"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7666nnn/GSM7666464/suppl/GSM7666464_18134-282-06_S51_L003_peaks.narrowPeak.gz"
77-
], # This is not the right way how to do it!
85+
["https://github.com/databio/geniml/raw/master/tests/data/io_data/bed/s1_a.bed.gz"],
86+
# TODO: This is not the right way to do it!
7887
)
7988
def test_region_set_from_url_cant_be_backed(self, url):
8089
with pytest.raises(GenimlBaseError):
@@ -105,6 +114,18 @@ def test_calculation_id(self):
105114
assert len(bedfile_id_2) == 32
106115
assert bedfile_id_1 == bedfile_id_2 == bedfile_id_3
107116

117+
@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
118+
def test_to_df(self, url):
119+
region_set = RegionSet(url, backed=False)
120+
pandas_df = region_set.to_pandas()
121+
assert isinstance(pandas_df, pd.DataFrame)
122+
123+
@pytest.mark.parametrize("url", ALL_BEDFILE_PATH)
124+
def test_to_df_backed(self, url):
125+
region_set = RegionSet(url, backed=True)
126+
pandas_df = region_set.to_pandas()
127+
assert isinstance(pandas_df, pd.DataFrame)
128+
108129

109130
class TestMaff:
110131
@pytest.mark.parametrize("path", ALL_MAF_PATH)

0 commit comments

Comments
 (0)