<?xml version="1.0" encoding="UTF-8"?>
<item xmlns="http://omeka.org/schemas/omeka-xml/v5" itemId="1189" public="1" featured="0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://omeka.org/schemas/omeka-xml/v5 http://omeka.org/schemas/omeka-xml/v5/omeka-xml-5-0.xsd" uri="https://socictopen.socict.org/items/show/1189?output=omeka-xml" accessDate="2026-04-20T23:20:21+00:00">
  <fileContainer>
    <file fileId="1189">
      <src>https://socictopen.socict.org/files/original/f6fc09b803a3e9a7358a71f4f3d4b713.pdf</src>
      <authentication>531c77d74f699c6f7571df467b25d2be</authentication>
    </file>
  </fileContainer>
  <collection collectionId="1">
    <elementSetContainer>
      <elementSet elementSetId="1">
        <name>Dublin Core</name>
        <description>The Dublin Core metadata element set is common to all Omeka records, including items, files, and collections. For more information see, http://dublincore.org/documents/dces/.</description>
        <elementContainer>
          <element elementId="50">
            <name>Title</name>
            <description>A name given to the resource</description>
            <elementTextContainer>
              <elementText elementTextId="1">
                <text>Coronavirus</text>
              </elementText>
            </elementTextContainer>
          </element>
          <element elementId="41">
            <name>Description</name>
            <description>An account of the resource</description>
            <elementTextContainer>
              <elementText elementTextId="2">
                <text>Dominio científico: Coronavirus</text>
              </elementText>
            </elementTextContainer>
          </element>
        </elementContainer>
      </elementSet>
    </elementSetContainer>
  </collection>
  <itemType itemTypeId="1">
    <name>Text</name>
    <description>A resource consisting primarily of words for reading. Examples include books, letters, dissertations, poems, newspapers, articles, archives of mailing lists. Note that facsimiles or images of texts are still of the genre Text.</description>
  </itemType>
  <elementSetContainer>
    <elementSet elementSetId="1">
      <name>Dublin Core</name>
      <description>The Dublin Core metadata element set is common to all Omeka records, including items, files, and collections. For more information see, http://dublincore.org/documents/dces/.</description>
      <elementContainer>
        <element elementId="50">
          <name>Title</name>
          <description>A name given to the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11335">
              <text>Better quality score compression through sequence-based quality smoothing</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="39">
          <name>Creator</name>
          <description>An entity primarily responsible for making the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11336">
              <text>Yoshihiro Shibuya, Matteo Comin</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="41">
          <name>Description</name>
          <description>An account of the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11337">
              <text>Abstract Motivation Current NGS techniques are becoming exponentially cheaper. As a result, there is an exponential growth of genomic data unfortunately not followed by an exponential growth of storage, leading to the necessity of compression. Most of the entropy of NGS data lies in the quality values associated to each read. Those values are often more diversified than necessary. Because of that, many tools such as Quartz or GeneCodeq, try to change (smooth) quality scores in order to improve compressibility without altering the important information they carry for downstream analysis like SNP calling. Results We use the FM-Index, a type of compressed suffix array, to reduce the storage requirements of a dictionary of k-mers and an effective smoothing algorithm to maintain high precision for SNP calling pipelines, while reducing quality scores entropy. We present YALFF (Yet Another Lossy Fastq Filter), a tool for quality scores compression by smoothing leading to improved compressibility of FASTQ files. The succinct k-mers dictionary allows YALFF to run on consumer computers with only 5.7 GB of available free RAM. YALFF smoothing algorithm can improve genotyping accuracy while using less resources. Availability https://github.com/yhhshb/yalff</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="40">
          <name>Date</name>
          <description>A point or period of time associated with an event in the lifecycle of the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11338">
              <text>2019</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="49">
          <name>Subject</name>
          <description>The topic of the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11339">
              <text>FASTQ compression, BWT, FM-Index</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="43">
          <name>Identifier</name>
          <description>An unambiguous reference to the resource within a given context</description>
          <elementTextContainer>
            <elementText elementTextId="11340">
              <text>DOI: 10.1186/s12859-019-2883-5</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="48">
          <name>Source</name>
          <description>A related resource from which the described resource is derived</description>
          <elementTextContainer>
            <elementText elementTextId="11341">
              <text>BMC Bioinformatics</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="45">
          <name>Publisher</name>
          <description>An entity responsible for making the resource available</description>
          <elementTextContainer>
            <elementText elementTextId="11342">
              <text>BMC</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="38">
          <name>Coverage</name>
          <description>The spatial or temporal topic of the resource, the spatial applicability of the resource, or the jurisdiction under which the resource is relevant</description>
          <elementTextContainer>
            <elementText elementTextId="11343">
              <text>Biology (General), Computer applications to medicine. Medical informatics</text>
            </elementText>
          </elementTextContainer>
        </element>
        <element elementId="44">
          <name>Language</name>
          <description>A language of the resource</description>
          <elementTextContainer>
            <elementText elementTextId="11344">
              <text>EN</text>
            </elementText>
          </elementTextContainer>
        </element>
      </elementContainer>
    </elementSet>
  </elementSetContainer>
</item>
